 import os
 from mlcomp.task.storage import Storage
 from mlcomp.utils.config import load_ordered_yaml
-from mlcomp.task.executors import Executor
 from mlcomp.task.app import app
 import socket
-from multiprocessing import cpu_count, Process
-import torch
-
+from multiprocessing import cpu_count
 from mlcomp.utils.misc import dict_func
 import psutil
 import GPUtil
 import numpy as np
 from mlcomp.task.tasks import execute_by_id
+from mlcomp.utils.schedule import start_schedule
+
 
 @click.group()
 def main():
@@ -46,28 +45,43 @@ def worker_usage():
 @main.command()
 @click.argument('number', type=int)
 def worker(number):
+    docker_img = os.getenv('DOCKER_IMG', 'default')
+    argv = [
+        'worker',
+        '--loglevel=INFO',
+        '-P=solo',
+        f'-n={number}',
+        '-O fair',
+        '-c=1',
+        '--prefetch-multiplier=1',
+        '-Q',
+        f'{socket.gethostname()}_{docker_img}'
+    ]
+    app.worker_main(argv)
+
+@main.command()
+def worker_supervisor():
     provider = ComputerProvider()
     tot_m, used_m, free_m = map(int, os.popen('free -t -m').readlines()[-1].split()[1:])
-
-    computer = Computer(name=socket.gethostname(), gpu=torch.cuda.device_count(), cpu=cpu_count(), memory=tot_m)
+    computer = Computer(name=socket.gethostname(), gpu=len(GPUtil.getGPUs()), cpu=cpu_count(), memory=tot_m)
     provider.create_or_update(computer, 'name')
 
-    # start_schedule([(worker_usage, 60)])
+    start_schedule([(worker_usage, 60)])
 
+    docker_img = os.getenv('DOCKER_IMG', 'default')
     argv = [
         'worker',
         '--loglevel=INFO',
         '-P=solo',
-        f'-n={number}',
+        f'-n=1',
         '-O fair',
         '-c=1',
        '--prefetch-multiplier=1',
         '-Q',
-        socket.gethostname()
+        f'{socket.gethostname()}_{docker_img}_supervisor'
    ]
     app.worker_main(argv)
 
-
 @main.command()
 def start_server():
     from mlcomp.server.back.app import start_server as _start_server
@@ -101,7 +115,8 @@ def _dag(config: str, debug: bool = False):
 
     folder = os.path.join(os.getcwd(), info['folder'])
     project = ProjectProvider().by_name(info['project']).id
-    dag = dag_provider.add(Dag(config=config_text, project=project, name=info['name']))
+    dag = dag_provider.add(Dag(config=config_text, project=project,
+                               name=info['name'], docker_img=info.get('docker_img')))
     storage.upload(folder, dag)
 
     created = OrderedDict()
@@ -112,8 +127,6 @@ def _dag(config: str, debug: bool = False):
         for d in v['depends']:
             if d not in executors:
                 raise Exception(f'Executor {k} depend on {d} which does not exist')
-            if not Executor.is_registered(executors[d]['type']):
-                raise Exception(f'Executor {d} has not been registered')
 
             valid = valid and d in created
         if valid:
@@ -166,8 +179,6 @@ def execute(config: str):
         for d in v['depends']:
             if d not in executors:
                 raise Exception(f'Executor {k} depend on {d} which does not exist')
-            if not Executor.is_registered(executors[d]['type']):
-                raise Exception(f'Executor {d} has not been registered')
 
             valid = valid and d in created
         if valid:
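
Note on the `-Q` arguments above: each Celery worker binds to a host- and image-specific queue, which is how tasks can be routed to a particular machine and Docker image. A minimal sketch of that naming scheme, assuming `DOCKER_IMG` falls back to 'default' exactly as in the diff (the helper functions below are hypothetical, for illustration only):

    import os
    import socket


    def worker_queue():
        # Queue consumed by `worker NUMBER`: '<hostname>_<docker_img>'
        docker_img = os.getenv('DOCKER_IMG', 'default')
        return f'{socket.gethostname()}_{docker_img}'


    def supervisor_queue():
        # Queue consumed by `worker_supervisor`: '<hostname>_<docker_img>_supervisor'
        return f'{worker_queue()}_supervisor'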
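
The supervisor also swaps the previously commented-out `start_schedule([(worker_usage, 60)])` call for a live one and imports the helper from `mlcomp.utils.schedule`. A rough sketch of what such a helper could look like, assuming it takes (callable, interval-in-seconds) pairs and runs each on a background daemon thread; this is an illustration, not the actual mlcomp implementation:

    import threading
    import time


    def start_schedule(jobs):
        # Illustrative sketch: run each (func, interval_seconds) pair forever
        # on its own daemon thread, so the caller can continue immediately.
        def loop(func, interval):
            while True:
                try:
                    func()
                except Exception:
                    pass  # a periodic usage reporter should survive one-off failures
                time.sleep(interval)

        for func, interval in jobs:
            threading.Thread(target=loop, args=(func, interval), daemon=True).start()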