-
Notifications
You must be signed in to change notification settings - Fork 0
Connect GroupByUploadToKVBulkLoad from Driver.scala to run.py #221
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8e9091f
44ceef1
d190ec0
69ddb67
041811c
c175ce9
bae2471
752a454
42af777
4acbd0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,66 +1,70 @@ | ||
{ | ||
"default": { | ||
"table_properties": { | ||
"source": "chronon" | ||
}, | ||
"common_env": { | ||
"VERSION": "latest", | ||
"SPARK_SUBMIT_PATH": "[TODO]/path/to/spark-submit", | ||
"JOB_MODE": "local[*]", | ||
"HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", | ||
"CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", | ||
"CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host=<YOUR_HOST> -Zkv-port=<YOUR_PORT>", | ||
"PARTITION_COLUMN": "ds", | ||
"PARTITION_FORMAT": "yyyy-MM-dd" | ||
}, | ||
"production": { | ||
"backfill" : { | ||
"EXECUTOR_CORES": "1", | ||
"DRIVER_MEMORY": "15G", | ||
"EXECUTOR_MEMORY": "8G", | ||
"PARALLELISM": "4000", | ||
"MAX_EXECUTORS": "1000" | ||
}, | ||
"upload" : { | ||
"EXECUTOR_CORES": "1", | ||
"EXECUTOR_MEMORY": "8G", | ||
"PARALLELISM": "1000", | ||
"MAX_EXECUTORS": "1000" | ||
}, | ||
"streaming" : { | ||
"EXECUTOR_CORES": "2", | ||
"EXECUTOR_MEMORY": "4G", | ||
"PARALLELISM": "16" | ||
} | ||
} | ||
"default": { | ||
"table_properties": { | ||
"source": "chronon" | ||
}, | ||
"sample_team": { | ||
"description": "Team description", | ||
"namespace": "chronon_db", | ||
"user": "# TODO: ldap user name to run the jobs as, from airflow or your own scheduler", | ||
"production": { | ||
"backfill" : { | ||
"EXECUTOR_CORES": "4" | ||
} | ||
}, | ||
"dev": { | ||
"backfill" : { | ||
"EXECUTOR_CORES": "2", | ||
"DRIVER_MEMORY": "30G" | ||
} | ||
} | ||
"common_env": { | ||
"VERSION": "latest", | ||
"SPARK_SUBMIT_PATH": "[TODO]/path/to/spark-submit", | ||
"JOB_MODE": "local[*]", | ||
"HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", | ||
"CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", | ||
"CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host=<YOUR_HOST> -Zkv-port=<YOUR_PORT>", | ||
Comment on lines
+8
to
+12
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove TODO placeholders before production deployment. These placeholder values could cause runtime issues if not properly configured. |
||
"PARTITION_COLUMN": "ds", | ||
"PARTITION_FORMAT": "yyyy-MM-dd", | ||
"CUSTOMER_ID": "canary", | ||
"GCP_PROJECT_ID": "canary-443022", | ||
"GCP_REGION": "us-central1", | ||
"GCP_DATAPROC_CLUSTER_NAME": "canary-2", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No changes needed for the PR, but I think at some point we'll want to hide some of these away from the customer (just making a note). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I'm going to clean up the teams.json in the etsy directory: https://github.com/etsy/zipline |
||
"GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance" | ||
}, | ||
"kaggle": { | ||
"description": "Workspace for kaggle competitions", | ||
"namespace": "default" | ||
}, | ||
"quickstart": { | ||
"description": "Used for the quickstart example", | ||
"namespace": "default" | ||
"production": { | ||
"backfill": { | ||
"EXECUTOR_CORES": "1", | ||
david-zlai marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"DRIVER_MEMORY": "15G", | ||
"EXECUTOR_MEMORY": "8G", | ||
"PARALLELISM": "4000", | ||
"MAX_EXECUTORS": "1000" | ||
}, | ||
"upload": { | ||
"EXECUTOR_CORES": "1", | ||
"EXECUTOR_MEMORY": "8G", | ||
"PARALLELISM": "1000", | ||
"MAX_EXECUTORS": "1000" | ||
}, | ||
"streaming": { | ||
"EXECUTOR_CORES": "2", | ||
"EXECUTOR_MEMORY": "4G", | ||
"PARALLELISM": "16" | ||
} | ||
} | ||
}, | ||
"sample_team": { | ||
"description": "Team description", | ||
"namespace": "chronon_db", | ||
"user": "# TODO: ldap user name to run the jobs as, from airflow or your own scheduler", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Replace TODO with actual LDAP username. Placeholder LDAP username needs to be configured before deployment. |
||
"production": { | ||
"backfill": { | ||
"EXECUTOR_CORES": "4" | ||
} | ||
}, | ||
"risk": { | ||
"description": "Used for proof of concept", | ||
"namespace": "default" | ||
"dev": { | ||
"backfill": { | ||
"EXECUTOR_CORES": "2", | ||
"DRIVER_MEMORY": "30G" | ||
} | ||
} | ||
|
||
}, | ||
"kaggle": { | ||
"description": "Workspace for kaggle competitions", | ||
"namespace": "default" | ||
}, | ||
"quickstart": { | ||
"description": "Used for the quickstart example", | ||
"namespace": "default" | ||
}, | ||
"risk": { | ||
"description": "Used for proof of concept", | ||
"namespace": "default" | ||
} | ||
} |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,9 +28,9 @@ class GcpApiImpl(conf: Map[String, String]) extends Api(conf) { | |
.getOrElse(throw new IllegalArgumentException("GCP_PROJECT_ID environment variable not set")) | ||
|
||
val instanceId = sys.env | ||
.get("GCP_INSTANCE_ID") | ||
.orElse(conf.get("GCP_INSTANCE_ID")) | ||
.getOrElse(throw new IllegalArgumentException("GCP_INSTANCE_ID environment variable not set")) | ||
.get("GCP_BIGTABLE_INSTANCE_ID") | ||
.orElse(conf.get("GCP_BIGTABLE_INSTANCE_ID")) | ||
.getOrElse(throw new IllegalArgumentException("GCP_BIGTABLE_INSTANCE_ID environment variable not set")) | ||
Comment on lines
+31
to
+33
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 💡 Codebase verification: Environment variable rename is incomplete. The variable name change from GCP_INSTANCE_ID to GCP_BIGTABLE_INSTANCE_ID is incomplete.
🔗 Analysis chain: LGTM! More descriptive environment variable name. The rename from GCP_INSTANCE_ID to GCP_BIGTABLE_INSTANCE_ID improves clarity. Let's verify the variable name is updated everywhere: 🏁 Scripts executed. The following scripts were executed for the analysis: Script: #!/bin/bash
# Search for any remaining instances of the old variable name
rg "GCP_INSTANCE_ID"
# Search for the new variable name to ensure consistent usage
rg "GCP_BIGTABLE_INSTANCE_ID"
Length of output: 1835 |
||
|
||
// Create settings builder based on whether we're in emulator mode (e.g. docker) or not | ||
val (dataSettingsBuilder, adminSettingsBuilder, maybeBQClient) = sys.env.get("BIGTABLE_EMULATOR_HOST") match { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
change to
ZIPLINE_
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i can change. but
CHRONON_ONLINE_CLASS
is set throughout run.py like here https://github.com/zipline-ai/chronon/blob/main/api/py/ai/chronon/repo/run.py#L684
still want to change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah got it okay maybe a followup.