    "backfill-left",
    "backfill-final",
    "upload",
+   "upload-to-kv",
    "streaming",
    "streaming-client",
    "consistency-metrics-compute",

# Constants for supporting multiple spark versions.
SUPPORTED_SPARK = ["2.4.0", "3.1.1", "3.2.1", "3.5.1"]
-SCALA_VERSION_FOR_SPARK = {"2.4.0": "2.11", "3.1.1": "2.12", "3.2.1": "2.13", "3.5.1": "2.12"}
+SCALA_VERSION_FOR_SPARK = {"2.4.0": "2.11",
+                           "3.1.1": "2.12", "3.2.1": "2.13", "3.5.1": "2.12"}

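For context, a minimal sketch of how these two tables are typically consumed (the helper name and the jar_type value are illustrative; only the spark_{jar_type}_{scala_version} artifact naming, which appears in download_jar below, comes from this file):

def resolve_spark_artifact(spark_version="3.5.1", jar_type="uber"):
    # Hypothetical helper: map a supported Spark release to its Scala build,
    # then derive the artifact name used when downloading the driver jar.
    assert spark_version in SUPPORTED_SPARK, "unsupported spark version"
    scala_version = SCALA_VERSION_FOR_SPARK[spark_version]
    return "spark_{}_{}".format(jar_type, scala_version)  # e.g. "spark_uber_2.12"
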
MODE_ARGS = {
    "backfill": OFFLINE_ARGS,
    "backfill-left": OFFLINE_ARGS,
    "backfill-final": OFFLINE_ARGS,
    "upload": OFFLINE_ARGS,
+   "upload-to-kv": ONLINE_WRITE_ARGS,
    "stats-summary": OFFLINE_ARGS,
    "log-summary": OFFLINE_ARGS,
    "analyze": OFFLINE_ARGS,
ROUTES = {
    "group_bys": {
        "upload": "group-by-upload",
+       "upload-to-kv": "groupby-upload-bulk-load",
        "backfill": "group-by-backfill",
        "streaming": "group-by-streaming",
        "metadata-upload": "metadata-upload",
APP_NAME_TEMPLATE = "chronon_{conf_type}_{mode}_{context}_{name}"
RENDER_INFO_DEFAULT_SCRIPT = "scripts/render_info.py"

+# GCP DATAPROC SPECIFIC CONSTANTS
DATAPROC_ENTRY = "ai.chronon.integrations.cloud_gcp.DataprocSubmitter"
+ZIPLINE_ONLINE_JAR_DEFAULT = "cloud_gcp-assembly-0.1.0-SNAPSHOT.jar"
+ZIPLINE_ONLINE_CLASS_DEFAULT = "ai.chronon.integrations.cloud_gcp.GcpApiImpl"


def retry_decorator(retries=3, backoff=20):
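A minimal illustration of how the new upload-to-kv mode resolves through the tables above (ONLINE_WRITE_ARGS is defined earlier in this file and is not shown in the diff; it is treated as an opaque template here):

# Illustration only: subcommand and argument template picked for the new mode.
conf_type, mode = "group_bys", "upload-to-kv"
subcommand = ROUTES[conf_type][mode]   # -> "groupby-upload-bulk-load"
arg_template = MODE_ARGS[mode]         # -> ONLINE_WRITE_ARGS
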
@@ -175,7 +182,8 @@ def download_only_once(url, path, skip_download=False):
    path = path.strip()
    if os.path.exists(path):
        content_output = check_output("curl -sI " + url).decode("utf-8")
-       content_length = re.search("(content-length:\\s)(\\d+)", content_output.lower())
+       content_length = re.search(
+           "(content-length:\\s)(\\d+)", content_output.lower())
        remote_size = int(content_length.group().split()[-1])
        local_size = int(check_output("wc -c " + path).split()[0])
        print(
@@ -189,7 +197,8 @@ def download_only_once(url, path, skip_download=False):
            print("Sizes match. Assuming it's already downloaded.")
            should_download = False
        if should_download:
-           print("Different file from remote at local: " + path + ". Re-downloading..")
+           print("Different file from remote at local: " +
+                 path + ". Re-downloading..")
            check_call("curl {} -o {} --connect-timeout 10".format(url, path))
    else:
        print("No file at: " + path + ". Downloading..")
@@ -213,7 +222,8 @@ def download_jar(
        "https://s01.oss.sonatype.org/service/local/repositories/public/content"
    )
    url_prefix = maven_url_prefix if maven_url_prefix else default_url_prefix
-   base_url = "{}/ai/chronon/spark_{}_{}".format(url_prefix, jar_type, scala_version)
+   base_url = "{}/ai/chronon/spark_{}_{}".format(
+       url_prefix, jar_type, scala_version)
    print("Downloading jar from url: " + base_url)
    jar_path = os.environ.get("CHRONON_DRIVER_JAR", None)
    if jar_path is None:
@@ -241,11 +251,15 @@ def download_jar(
            scala_version=scala_version,
            jar_type=jar_type,
        )
-       jar_path = os.path.join("/tmp", jar_url.split("/")[-1])
+       jar_path = os.path.join("/tmp", extract_filename_from_path(jar_url))
        download_only_once(jar_url, jar_path, skip_download)
    return jar_path


+def get_teams_json_file_path(repo_path):
+    return os.path.join(repo_path, "teams.json")
+
+
def set_runtime_env(params):
    """
    Setting the runtime environment variables.
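A quick usage sketch of the new helper (the repo path is made up); extract_filename_from_path, used in the jar_path change above, is defined later in this diff:

get_teams_json_file_path("/home/me/zipline")
# -> "/home/me/zipline/teams.json"
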
@@ -276,10 +290,11 @@ def set_runtime_env(params):
    if effective_mode and "streaming" in effective_mode:
        effective_mode = "streaming"
    if params["repo"]:
-       teams_file = os.path.join(params["repo"], "teams.json")
+       teams_file = get_teams_json_file_path(params["repo"])
        if os.path.exists(teams_file):
            with open(teams_file, "r") as infile:
                teams_json = json.load(infile)
+               # we should have a fallback if the user wants to use a key other than `default`
                environment["common_env"] = teams_json.get("default", {}).get(
                    "common_env", {}
                )
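For reference, a hypothetical teams.json shape that satisfies the lookups in this function (default.common_env here, and <team>.production.<mode> further down); the real file in a repo may carry more keys, and the environment variable names shown are made up:

{
    "default": {"common_env": {"VERSION": "latest"}},
    "my_team": {"production": {"backfill": {"EXECUTOR_MEMORY": "4G"}}}
}
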
@@ -320,7 +335,8 @@ def set_runtime_env(params):
                "backfill-final",
            ]:
                environment["conf_env"]["CHRONON_CONFIG_ADDITIONAL_ARGS"] = (
-                   " ".join(custom_json(conf_json).get("additional_args", []))
+                   " ".join(custom_json(conf_json).get(
+                       "additional_args", []))
                )
            environment["cli_args"]["APP_NAME"] = APP_NAME_TEMPLATE.format(
                mode=effective_mode,
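A worked example of the app name produced here (all field values are made up):

APP_NAME_TEMPLATE.format(
    conf_type="group_bys", mode="backfill", context="dev", name="my_team.my_gb"
)
# -> "chronon_group_bys_backfill_dev_my_team.my_gb"
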
@@ -333,7 +349,8 @@ def set_runtime_env(params):
            )
            # fall-back to prod env even in dev mode when dev env is undefined.
            environment["production_team_env"] = (
-               teams_json[team].get("production", {}).get(effective_mode, {})
+               teams_json[team].get("production", {}).get(
+                   effective_mode, {})
            )
            # By default use production env.
            environment["default_env"] = (
@@ -354,7 +371,8 @@ def set_runtime_env(params):
        for k in [
            "chronon",
            conf_type,
-           params["mode"].replace("-", "_") if params["mode"] else None,
+           params["mode"].replace(
+               "-", "_") if params["mode"] else None,
        ]
        if k is not None
    ]
@@ -402,15 +420,17 @@ def __init__(self, args, jar_path):

        if self.conf:
            try:
-               self.context, self.conf_type, self.team, _ = self.conf.split("/")[-4:]
+               self.context, self.conf_type, self.team, _ = self.conf.split(
+                   "/")[-4:]
            except Exception as e:
                logging.error(
                    "Invalid conf path: {}, please ensure to supply the relative path to zipline/ folder".format(
                        self.conf
                    )
                )
                raise e
-           possible_modes = list(ROUTES[self.conf_type].keys()) + UNIVERSAL_ROUTES
+           possible_modes = list(
+               ROUTES[self.conf_type].keys()) + UNIVERSAL_ROUTES
            assert (
                args["mode"] in possible_modes
            ), "Invalid mode:{} for conf:{} of type:{}, please choose from {}".format(
@@ -520,8 +540,6 @@ def run(self):
                )
                command_list.append(command)
            else:
-               # offline mode
-
                # we'll always download the jar for now so that we can pull
                # in any fixes or latest changes
                dataproc_jar = download_dataproc_jar(temp_dir,
@@ -544,7 +562,8 @@ def run(self):
                    script=self.spark_submit,
                    jar=self.jar_path,
                    subcommand=ROUTES[self.conf_type][self.mode],
-                   args=self._gen_final_args(start_ds=start_ds, end_ds=end_ds),
+                   args=self._gen_final_args(
+                       start_ds=start_ds, end_ds=end_ds),
                    additional_args=os.environ.get(
                        "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
                    ),
@@ -563,11 +582,19 @@ def run(self):
                    # when we include the gcs file path as part of dataproc,
                    # the file is copied to root and not the complete path
                    # is copied.
-                   override_conf_path=self.conf.split("/")[-1]),
+                   override_conf_path=extract_filename_from_path(
+                       self.conf) if self.conf else None),
                    additional_args=os.environ.get(
                        "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
                    ),
                )
+               local_files_to_upload_to_gcs = []
+               if self.conf:
+                   local_files_to_upload_to_gcs.append(
+                       self.conf)
+               # upload teams.json to gcs
+               local_files_to_upload_to_gcs.append(
+                   get_teams_json_file_path(self.repo))

                dataproc_command = generate_dataproc_submitter_args(
                    local_files_to_upload_to_gcs=[self.conf],
@@ -603,19 +630,28 @@ def run(self):
                    # does get reflected on GCS. But when we include the gcs file
                    # path as part of dataproc, the file is copied to root and
                    # not the complete path is copied.
-                   override_conf_path=self.conf.split("/")[-1]),
+                   override_conf_path=extract_filename_from_path(
+                       self.conf) if self.conf else None),
                    additional_args=os.environ.get(
                        "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
                    ),
                )
+               local_files_to_upload_to_gcs = []
+               if self.conf:
+                   local_files_to_upload_to_gcs.append(self.conf)
+
+               # upload teams.json to gcs
+               local_files_to_upload_to_gcs.append(
+                   get_teams_json_file_path(self.repo))

                dataproc_command = generate_dataproc_submitter_args(
                    # for now, self.conf is the only local file that requires uploading to gcs
-                   local_files_to_upload_to_gcs=[self.conf],
+                   local_files_to_upload_to_gcs=local_files_to_upload_to_gcs,
                    user_args=user_args
                )
                command = f"java -cp {dataproc_jar} {DATAPROC_ENTRY} {dataproc_command}"
                command_list.append(command)
+
            if len(command_list) > 1:
                # parallel backfill mode
                with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
@@ -632,16 +668,23 @@ def _gen_final_args(self, start_ds=None, end_ds=None, override_conf_path=None):
        base_args = MODE_ARGS[self.mode].format(
            conf_path=override_conf_path if override_conf_path else self.conf,
            ds=end_ds if end_ds else self.ds,
-           online_jar=self.online_jar,
-           online_class=self.online_class,
+           online_jar=self.online_jar if not self.dataproc else ZIPLINE_ONLINE_JAR_DEFAULT,
+           online_class=self.online_class if not self.dataproc else ZIPLINE_ONLINE_CLASS_DEFAULT,
        )
        override_start_partition_arg = (
            " --start-partition-override=" + start_ds if start_ds else ""
        )
-       final_args = base_args + " " + str(self.args) + override_start_partition_arg
+
+       final_args = base_args + " " + \
+           str(self.args) + override_start_partition_arg
+
        return final_args


+def extract_filename_from_path(path):
+    return path.split("/")[-1]
+
+
def split_date_range(start_date, end_date, parallelism):
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
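Two illustrative notes on the hunk above (the sample path is made up): extract_filename_from_path simply takes the trailing "/"-separated component, and when self.dataproc is set the online jar/class placeholders fall back to the bundled GCP defaults.

extract_filename_from_path("production/group_bys/my_team/my_conf")  # -> "my_conf"

# With self.dataproc set, the template placeholders resolve to:
#   online_jar   -> "cloud_gcp-assembly-0.1.0-SNAPSHOT.jar"        (ZIPLINE_ONLINE_JAR_DEFAULT)
#   online_class -> "ai.chronon.integrations.cloud_gcp.GcpApiImpl"  (ZIPLINE_ONLINE_CLASS_DEFAULT)
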
@@ -653,7 +696,8 @@ def split_date_range(start_date, end_date, parallelism):

    # Check if parallelism is greater than total_days
    if parallelism > total_days:
-       raise ValueError("Parallelism should be less than or equal to total days")
+       raise ValueError(
+           "Parallelism should be less than or equal to total days")

    split_size = total_days // parallelism
    date_ranges = []
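An illustrative call hitting the guard above (the dates are made up):

split_date_range("2024-01-01", "2024-01-03", parallelism=30)
# raises ValueError: more parallel slices requested than days in the range
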
@@ -710,24 +754,27 @@ def get_customer_id() -> str:
def get_gcp_project_id() -> str:
    gcp_project_id = os.environ.get('ZIPLINE_GCP_PROJECT_ID')
    if not gcp_project_id:
-       raise ValueError('Please set ZIPLINE_GCP_PROJECT_ID environment variable')
+       raise ValueError(
+           'Please set ZIPLINE_GCP_PROJECT_ID environment variable')
    return gcp_project_id


def generate_dataproc_submitter_args(local_files_to_upload_to_gcs: List[str], user_args: str):
    customer_warehouse_bucket_name = f"zipline-warehouse-{get_customer_id()}"

    gcs_files = []
-   for f in local_files_to_upload_to_gcs:
+   for source_file in local_files_to_upload_to_gcs:
        # upload to `metadata` folder
-       destination_file_path = f"metadata/{f}"
-       gcs_files.append(upload_gcs_blob(customer_warehouse_bucket_name, f, destination_file_path))
+       destination_file_path = f"metadata/{extract_filename_from_path(source_file)}"
+       gcs_files.append(upload_gcs_blob(
+           customer_warehouse_bucket_name, source_file, destination_file_path))

    # we also want the additional-confs included here. it should already be in the bucket

    zipline_artifacts_bucket_prefix = 'gs://zipline-artifacts'

-   gcs_files.append(f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}/confs/additional-confs.yaml")
+   gcs_files.append(
+       f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}/confs/additional-confs.yaml")

    gcs_file_args = ",".join(gcs_files)
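To illustrate the destination-path change above (the local path is hypothetical): uploads now land under a flat metadata/ prefix keyed by file name instead of mirroring the full local path.

# before: destination_file_path == "metadata/production/group_bys/my_team/my_conf"
# after:  destination_file_path == "metadata/" + extract_filename_from_path(
#             "production/group_bys/my_team/my_conf")   # == "metadata/my_conf"
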
@@ -750,7 +797,8 @@ def download_dataproc_jar(destination_dir: str, customer_id: str):
    source_blob_name = f"jars/{file_name}"
    dataproc_jar_destination_path = f"{destination_dir}/{file_name}"

-   download_gcs_blob(bucket_name, source_blob_name, dataproc_jar_destination_path)
+   download_gcs_blob(bucket_name, source_blob_name,
+                     dataproc_jar_destination_path)
    return dataproc_jar_destination_path