Skip to content
This repository was archived by the owner on Mar 3, 2023. It is now read-only.

Commit e1dda9a

Browse files
authored
Updates to Kubernetes scheduler to fix dockerenv issue (#3550)
* Updates to Kubernetes scheduler to create Headless service * Use FQDN instead of hostname when running in Kubernetes environment
1 parent eb5090b commit e1dda9a

File tree

9 files changed

+2856
-2847
lines changed

9 files changed

+2856
-2847
lines changed

WORKSPACE

+2-3
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jetty_version = "9.4.6.v20170531"
5454

5555
jersey_version = "2.25.1"
5656

57-
kubernetes_client_version = "7.0.0"
57+
kubernetes_client_version = "8.0.0"
5858

5959
load("@rules_jvm_external//:defs.bzl", "maven_install")
6060
load("@rules_jvm_external//:specs.bzl", "maven")
@@ -73,15 +73,14 @@ maven_install(
7373
"org.apache.mesos:mesos:0.22.0",
7474
"com.hashicorp.nomad:nomad-sdk:0.7.0",
7575
"org.apache.hadoop:hadoop-core:0.20.2",
76-
"org.apache.pulsar:pulsar-client:1.19.0-incubating",
76+
"org.apache.pulsar:pulsar-client:jar:shaded:1.19.0-incubating",
7777
"org.apache.kafka:kafka-clients:2.2.0",
7878
"com.google.apis:google-api-services-storage:v1-rev108-" + google_client_version,
7979
"org.apache.reef:reef-runtime-yarn:" + reef_version,
8080
"org.apache.reef:reef-runtime-local:" + reef_version,
8181
"org.apache.httpcomponents:httpclient:" + http_client_version,
8282
"org.apache.httpcomponents:httpmime:" + http_client_version,
8383
"com.google.apis:google-api-services-storage:v1-rev108-1.22.0",
84-
"io.kubernetes:client-java:7.0.0",
8584
"com.microsoft.dhalion:dhalion:0.2.3",
8685
"org.objenesis:objenesis:2.1",
8786
"org.ow2.asm:asm-all:5.1",

examples/src/java/BUILD

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ java_binary(
2929
"//heron/common/src/java:basics-java",
3030
"//heron/simulator/src/java:simulator-java",
3131
"//third_party/java:kryo",
32-
"@maven//:org_apache_pulsar_pulsar_client",
32+
"@maven//:org_apache_pulsar_pulsar_client_shaded",
3333
],
3434
)
3535

heron/executor/src/python/heron_executor.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ def log_pid_for_process(process_name, pid):
140140
def is_docker_environment():
141141
return os.path.isfile('/.dockerenv')
142142

143+
def is_kubernetes_environment():
144+
return 'POD_NAME' in os.environ
145+
143146
def stdout_log_fn(cmd):
144147
"""Simple function callback that is used to log the streaming output of a subprocess command
145148
:param cmd: the name of the command which will be added to the log line
@@ -235,7 +238,9 @@ def init_from_parsed_args(self, parsed_args):
235238
# Needed for Docker environments since the hostname of a docker container is the container's
236239
# id within docker, rather than the host's hostname. NOTE: this 'HOST' env variable is not
237240
# guaranteed to be set in all Docker executor environments (outside of Marathon)
238-
if is_docker_environment():
241+
if is_kubernetes_environment():
242+
self.master_host = socket.getfqdn()
243+
elif is_docker_environment():
239244
self.master_host = os.environ.get('HOST') if 'HOST' in os.environ else socket.gethostname()
240245
else:
241246
self.master_host = socket.gethostname()

heron/schedulers/src/java/org/apache/heron/scheduler/kubernetes/KubernetesCompat.java

-79
This file was deleted.

heron/schedulers/src/java/org/apache/heron/scheduler/kubernetes/KubernetesScheduler.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ public class KubernetesScheduler implements IScheduler, IScalable {
5050
private UpdateTopologyManager updateTopologyManager;
5151

5252
protected KubernetesController getController() {
53-
return new AppsV1Controller(configuration, runtimeConfiguration);
53+
return new V1Controller(configuration, runtimeConfiguration);
5454
}
5555

5656
@Override

heron/schedulers/src/java/org/apache/heron/scheduler/kubernetes/AppsV1Controller.java heron/schedulers/src/java/org/apache/heron/scheduler/kubernetes/V1Controller.java

+67-21
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import io.kubernetes.client.openapi.ApiException;
4949
import io.kubernetes.client.openapi.Configuration;
5050
import io.kubernetes.client.openapi.apis.AppsV1Api;
51+
import io.kubernetes.client.openapi.apis.CoreV1Api;
5152
import io.kubernetes.client.openapi.models.V1Container;
5253
import io.kubernetes.client.openapi.models.V1ContainerPort;
5354
import io.kubernetes.client.openapi.models.V1EnvVar;
@@ -58,6 +59,8 @@
5859
import io.kubernetes.client.openapi.models.V1PodSpec;
5960
import io.kubernetes.client.openapi.models.V1PodTemplateSpec;
6061
import io.kubernetes.client.openapi.models.V1ResourceRequirements;
62+
import io.kubernetes.client.openapi.models.V1Service;
63+
import io.kubernetes.client.openapi.models.V1ServiceSpec;
6164
import io.kubernetes.client.openapi.models.V1StatefulSet;
6265
import io.kubernetes.client.openapi.models.V1StatefulSetSpec;
6366
import io.kubernetes.client.openapi.models.V1Toleration;
@@ -66,21 +69,23 @@
6669

6770
import okhttp3.Response;
6871

69-
public class AppsV1Controller extends KubernetesController {
72+
public class V1Controller extends KubernetesController {
7073

7174
private static final Logger LOG =
72-
Logger.getLogger(AppsV1Controller.class.getName());
75+
Logger.getLogger(V1Controller.class.getName());
7376

7477
private static final String ENV_SHARD_ID = "SHARD_ID";
7578

7679
private final AppsV1Api appsClient;
80+
private final CoreV1Api coreClient;
7781

78-
AppsV1Controller(Config configuration, Config runtimeConfiguration) {
82+
V1Controller(Config configuration, Config runtimeConfiguration) {
7983
super(configuration, runtimeConfiguration);
8084
try {
8185
final ApiClient apiClient = io.kubernetes.client.util.Config.defaultClient();
8286
Configuration.setDefaultApiClient(apiClient);
8387
appsClient = new AppsV1Api(apiClient);
88+
coreClient = new CoreV1Api(apiClient);
8489
} catch (IOException e) {
8590
LOG.log(Level.SEVERE, "Failed to setup Kubernetes client" + e);
8691
throw new RuntimeException(e);
@@ -96,6 +101,16 @@ boolean submit(PackingPlan packingPlan) {
96101

97102
final Resource containerResource = getContainerResource(packingPlan);
98103

104+
final V1Service topologyService = createTopologyyService();
105+
try {
106+
final V1Service response =
107+
coreClient.createNamespacedService(getNamespace(), topologyService, null,
108+
null, null);
109+
} catch (ApiException e) {
110+
KubernetesUtils.logExceptionWithDetails(LOG, "Error creating topology service", e);
111+
throw new TopologySubmissionException(e.getMessage());
112+
}
113+
99114
// find the max number of instances in a container so we can open
100115
// enough ports if remote debugging is enabled.
101116
int numberOfInstances = 0;
@@ -118,11 +133,9 @@ boolean submit(PackingPlan packingPlan) {
118133

119134
@Override
120135
boolean killTopology() {
121-
return
122-
isStatefulSet()
123-
? deleteStatefulSet()
124-
:
125-
new KubernetesCompat().killTopology(getKubernetesUri(), getTopologyName(), getNamespace());
136+
deleteStatefulSet();
137+
deleteService();
138+
return true;
126139
}
127140

128141
@Override
@@ -199,6 +212,31 @@ V1StatefulSet getStatefulSet() throws ApiException {
199212
null, null, null);
200213
}
201214

215+
boolean deleteService() {
216+
try {
217+
final Response response = coreClient.deleteNamespacedServiceCall(getTopologyName(),
218+
getNamespace(), null, null, 0, null,
219+
KubernetesConstants.DELETE_OPTIONS_PROPAGATION_POLICY, null, null).execute();
220+
221+
if (response.isSuccessful()) {
222+
LOG.log(Level.INFO, "Headless Service for the Job [" + getTopologyName()
223+
+ "] in namespace [" + getNamespace() + "] is deleted.");
224+
return true;
225+
} else {
226+
LOG.log(Level.SEVERE, "Error when deleting the Service of the job ["
227+
+ getTopologyName() + "] in namespace [" + getNamespace() + "]");
228+
LOG.log(Level.SEVERE, "Error killing topoogy message:" + response.message());
229+
KubernetesUtils.logResponseBodyIfPresent(LOG, response);
230+
231+
throw new TopologyRuntimeManagementException(
232+
KubernetesUtils.errorMessageFromResponse(response));
233+
}
234+
} catch (IOException | ApiException e) {
235+
KubernetesUtils.logExceptionWithDetails(LOG, "Error deleting topology service", e);
236+
return false;
237+
}
238+
}
239+
202240
boolean deleteStatefulSet() {
203241
try {
204242
final Response response = appsClient.deleteNamespacedStatefulSetCall(getTopologyName(),
@@ -211,7 +249,7 @@ boolean deleteStatefulSet() {
211249
return true;
212250
} else {
213251
LOG.log(Level.SEVERE, "Error when deleting the StatefulSet of the job ["
214-
+ getTopologyName() + "]: in namespace [" + getNamespace() + "]");
252+
+ getTopologyName() + "] in namespace [" + getNamespace() + "]");
215253
LOG.log(Level.SEVERE, "Error killing topology message: " + response.message());
216254
KubernetesUtils.logResponseBodyIfPresent(LOG, response);
217255

@@ -224,18 +262,6 @@ boolean deleteStatefulSet() {
224262
}
225263
}
226264

227-
boolean isStatefulSet() {
228-
try {
229-
final V1StatefulSet response =
230-
appsClient.readNamespacedStatefulSet(getTopologyName(), getNamespace(),
231-
null, null, null);
232-
return response.getKind().equals("StatefulSet");
233-
} catch (ApiException e) {
234-
LOG.warning("isStatefulSet check " + e.getMessage());
235-
}
236-
return false;
237-
}
238-
239265
protected List<String> getExecutorCommand(String containerId) {
240266
final Map<ExecutorPort, String> ports =
241267
KubernetesConstants.EXECUTOR_PORTS.entrySet()
@@ -262,6 +288,26 @@ private static String setShardIdEnvironmentVariableCommand() {
262288
return String.format("%s=${POD_NAME##*-} && echo shardId=${%s}", ENV_SHARD_ID, ENV_SHARD_ID);
263289
}
264290

291+
private V1Service createTopologyyService() {
292+
final String topologyName = getTopologyName();
293+
final Config runtimeConfiguration = getRuntimeConfiguration();
294+
295+
final V1Service service = new V1Service();
296+
297+
// setup service metadata
298+
final V1ObjectMeta objectMeta = new V1ObjectMeta();
299+
objectMeta.name(topologyName);
300+
service.setMetadata(objectMeta);
301+
302+
// create the headless service
303+
final V1ServiceSpec serviceSpec = new V1ServiceSpec();
304+
serviceSpec.clusterIP("None");
305+
serviceSpec.setSelector(getMatchLabels(topologyName));
306+
307+
service.setSpec(serviceSpec);
308+
309+
return service;
310+
}
265311

266312
private V1StatefulSet createStatefulSet(Resource containerResource, int numberOfInstances) {
267313
final String topologyName = getTopologyName();

heron/stmgr/src/cpp/manager/ckptmgr-client.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@ CkptMgrClient::CkptMgrClient(std::shared_ptr<EventLoop> eventloop, const Network
4545
ckptmgr_id_(_ckptmgr_id),
4646
stmgr_id_(_stmgr_id),
4747
quit_(false),
48-
pplan_(nullptr),
4948
ckpt_saved_watcher_(_ckpt_saved_watcher),
5049
ckpt_get_watcher_(_ckpt_get_watcher),
51-
register_watcher_(_register_watcher) {
50+
register_watcher_(_register_watcher),
51+
pplan_(nullptr) {
5252

5353
// TODO(nlu): take the value from config
5454
reconnect_cpktmgr_interval_sec_ = 10;

heron/stmgr/tests/cpp/server/stateful-restorer_unittest.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ TEST(StatefulRestorer, deadinstances) {
346346
// Send notification that some tasks have recovered
347347
EXPECT_GT(local_tasks.size(), 1);
348348
bool first = true;
349-
int32_t troublesome_task;
349+
int32_t troublesome_task = 0;
350350
for (auto task : local_tasks) {
351351
if (first) {
352352
first = false;
@@ -426,7 +426,7 @@ TEST(StatefulRestorer, deadckptmgr) {
426426
// Send notification that some tasks have recovered
427427
EXPECT_GT(local_tasks.size(), 1);
428428
bool first = true;
429-
int32_t troublesome_task;
429+
int32_t troublesome_task = 0;
430430
// ckpt delivers some checkpoints
431431
for (auto task : local_tasks) {
432432
if (first) {

0 commit comments

Comments
 (0)