Direct the content of log files to stdout of pods in kubernetes deployment (#2871)

siyuan0322 · web-flow · commit 559b084b6833 · 2023-06-13T20:50:23.000+08:00
Fixes #2357
diff --git a/README.md b/README.md
@@ -347,7 +347,7 @@ sudo make install
 
 ### Building Docker images
 
-GraphScope ships with a [Dockerfile](k8s/graphscope.Dockerfile) that can build docker images for releasing. The images are built on a `builder` image with all dependencies installed and copied to
+GraphScope ships with a [Dockerfile](k8s/dockerfiles/graphscope-dev.Dockerfile) that can build docker images for releasing. The images are built on a `builder` image with all dependencies installed and copied to
 a `runtime-base` image. To build images with latest version of GraphScope, go to the `k8s/internal` directory under root directory and run this command.
 
 ```bash
diff --git a/coordinator/gscoordinator/cluster_builder.py b/coordinator/gscoordinator/cluster_builder.py
@@ -163,6 +163,7 @@ def __init__(
 
         self._vineyard_requests = {"cpu": vineyard_cpu, "memory": vineyard_mem}
         self._analytical_requests = {"cpu": engine_cpu, "memory": engine_mem}
+        # Should give executor a smaller value, since it doesn't need to load the graph
         self._executor_requests = {"cpu": "2000m", "memory": engine_mem}
         self._learning_requests = {"cpu": "1000m", "memory": "256Mi"}
         self._frontend_requests = {"cpu": "200m", "memory": "512Mi"}
@@ -265,10 +266,15 @@ def get_engine_container_helper(
         )
         return container
 
+    def _get_tail_if_exists_cmd(self, fname: str):
+        return (
+            f"while true; do if [ -e {fname} ]; then tail -f {fname}; fi; sleep 1; done"
+        )
+
     def get_analytical_container(self, volume_mounts, with_java=False):
         name = self.analytical_container_name
         image = self._analytical_image if not with_java else self._analytical_java_image
-        args = ["tail", "-f", "/dev/null"]
+        args = ["bash", "-c", self._get_tail_if_exists_cmd("/tmp/grape_engine.INFO")]
         container = self.get_engine_container_helper(
             name,
             image,
@@ -292,7 +298,11 @@ def get_analytical_container(self, volume_mounts, with_java=False):
     def get_interactive_executor_container(self, volume_mounts):
         name = self.interactive_executor_container_name
         image = self._interactive_executor_image
-        args = ["tail", "-f", "/dev/null"]
+        args = [
+            "bash",
+            "-c",
+            self._get_tail_if_exists_cmd("/var/log/graphscope/current/executor.log"),
+        ]
         container = self.get_engine_container_helper(
             name,
             image,
@@ -445,7 +455,7 @@ def get_engine_headless_service(self):
             "ClusterIP", ports, self._engine_labels, None
         )
 
-        # Necessary, create a headless service for statefulset
+        # Necessary, create a headless service for statefulsets
         service_spec.cluster_ip = "None"
         service = ResourceBuilder.get_service(
             self._namespace, name, service_spec, self._engine_labels
@@ -534,7 +544,11 @@ def get_graphlearn_service_endpoint(self, api_client, object_id, pod_host_ip_lis
     def get_interactive_frontend_container(self):
         name = self.interactive_frontend_container_name
         image = self._interactive_frontend_image
-        args = ["tail", "-f", "/dev/null"]
+        args = [
+            "bash",
+            "-c",
+            self._get_tail_if_exists_cmd("/var/log/graphscope/current/frontend.log"),
+        ]
         container = kube_client.V1Container(name=name, image=image, args=args)
         container.image_pull_policy = self._image_pull_policy
         container.resources = ResourceBuilder.get_resources(
diff --git a/docs/development/how_to_debug.md b/docs/development/how_to_debug.md
@@ -2,9 +2,37 @@ This document shows how to debugging GraphScope under various conditions.
 
 ### Debugging on local deployment
 
+## Find the logs
+
+Most of the logs are streamed to the stdout of the client. You can control the log level with:
+
+```python
+import graphscope
+graphscope.set_option(show_log=True)
+graphscope.set_option(log_level='DEBUG')  # could also be INFO, ERROR
+```
+
+GraphScope is composed of three engines. The detailed logs of each engine can be found at:
+
+- Analytical Engine: `/tmp/grape_engine.INFO`
+- Interactive Engine: Inside `/var/log/graphscope/`, or `$HOME/log/graphscope` if GraphScope doesn't have permission to write to `/var/log`. You may find several folders, each named with a long number, which is the object id of the graph. There is also a `current` link that points to the log folder of the most recently created interactive instance.
+- Learning Engine: `graphlearn.INFO` in the current directory.
+
+
 
 ### Debugging on Kubernetes deployment
 
+## Find the logs
+
+In a Kubernetes environment, most of the logs are still output to the console. In addition, you can find detailed logs in each pod's stdout, or in files inside each pod.
+
+Note: You could use `kubectl logs <pod>` to inspect the stdout of the pod. Use `kubectl logs <pod> -c <container>` to inspect a specific container inside the pod.
+
+- Coordinator: The stdout of coordinator pod.
+- Analytical Engine: The stdout of the engine container in the engine pod.
+- Interactive Engine: The stdout of the executor container in the engine pod for the executor log, and the stdout of the interactive-frontend pod for the frontend log. The log files reside in `/var/log/graphscope` inside each container.
+
+
 ## Commands for Debugging
 
 Here is list with commands usually used for checking the status of the GraphScope deployment on K8s.
diff --git a/docs/frequently_asked_questions.rst b/docs/frequently_asked_questions.rst
@@ -32,7 +32,8 @@ If you don't find an answer to your question here, feel free to file an `Issues`
 
        graphscope.set_option(show_log=True)
 
-    If you are running GraphScope in k8s, you can use `kubectl describe/logs <https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands>`_ to check the log/status of the cluster. If the disk storage is accessible(on local or via Pods), you may also find logs in `/tmp/gs/runtime/logs`.
+    If you are running GraphScope in k8s, you can use `kubectl describe/logs <https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands>`_ to check the log/status of the pods of GraphScope. 
+    If the disk storage is accessible (locally or via Pods), you may also find logs in ``/var/log/graphscope/current`` or ``$HOME/.local/log/graphscope``.
 
 
 4. Why I find more Pods than expected with command `kubectl get pod`?
diff --git a/interactive_engine/assembly/src/bin/graphscope/giectl b/interactive_engine/assembly/src/bin/graphscope/giectl
@@ -80,6 +80,10 @@ start_frontend() {
   declare -r pid_dir=${GRAPHSCOPE_RUNTIME}/pid/${object_id}
   mkdir -p ${log_dir} ${config_dir} ${pid_dir}
 
+  # make a "current" link
+  unlink ${GS_LOG}/current || true
+  ln -s ${log_dir} ${GS_LOG}/current
+
   declare java_opt="-server
             -verbose:gc
             -Xloggc:${log_dir}/frontend.gc.log
@@ -149,6 +153,10 @@ start_executor() {
   export LD_LIBRARY_PATH=${GRAPHSCOPE_HOME}/lib:${LD_LIBRARY_PATH}
   export DYLD_LIBRARY_PATH=${GRAPHSCOPE_HOME}/lib:${DYLD_LIBRARY_PATH}
 
+  # make a "current" link
+  unlink ${GS_LOG}/current || true
+  ln -s ${log_dir} ${GS_LOG}/current
+
   # set executor config file
   sed -e "s@GRAPH_NAME@${object_id}@g" \
       -e "s@VINEYARD_OBJECT_ID@${object_id}@g" \
@@ -206,9 +214,6 @@ create_gremlin_instance_on_local() {
   mkdir -p ${GS_LOG}
 
   declare -r log_dir=${GS_LOG}/${object_id}
-  # make a "current" link
-  unlink ${GS_LOG}/current || true
-  ln -s ${log_dir} ${GS_LOG}/current
 
   # Frontend use executor rpc port
   network_servers=""