Skip to content

docs: add samples for ArimiaPlus time_series_id_col feature #1577

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,6 @@ coverage.xml
system_tests/local_test_setup

# Make sure a generated file isn't accidentally committed.
demo.ipynb
pylintrc
pylintrc.test
7 changes: 7 additions & 0 deletions owlbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@
# Fixup files
# ----------------------------------------------------------------------------

# Add scratch space for experimentation to .gitignore.
assert 1 == s.replace(
[".gitignore"],
re.escape("# Make sure a generated file isn't accidentally committed.\n"),
"# Make sure a generated file isn't accidentally committed.\ndemo.ipynb\n",
)

# Encourage sharring all relevant versions in bug reports.
assert 1 == s.replace( # bug_report.md
[".github/ISSUE_TEMPLATE/bug_report.md"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,26 +73,91 @@ def test_multiple_timeseries_forecasting_model(random_model_id: str) -> None:
from bigframes.ml import forecasting
import bigframes.pandas as bpd

model = forecasting.ARIMAPlus(
# To reduce the query runtime with the compromise of a potential slight
# drop in model quality, you could decrease the value of the
# auto_arima_max_order. This shrinks the search space of hyperparameter
# tuning in the auto.ARIMA algorithm.
auto_arima_max_order=5,
)

df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips")

# This query creates twelve time series models, one for each of the twelve
# Citi Bike start stations in the input data. If you remove this row
# filter, there would be 600+ time series to forecast.
df = df[df["start_station_name"].str.contains("Central Park")]

features = bpd.DataFrame(
{
"num_trips": df.starttime,
"start_station_name": df["start_station_name"],
"num_trips": df["starttime"],
"date": df["starttime"].dt.date,
}
)
num_trips = features.groupby(["date"], as_index=False).count()
model = forecasting.ARIMAPlus()
num_trips = features.groupby(
["start_station_name", "date"],
as_index=False,
).count()

X = num_trips["date"].to_frame()
y = num_trips["num_trips"].to_frame()

model.fit(X, y)
model.fit(
X,
y,
# The input data that you want to get forecasts for,
# in this case the Citi Bike station, as represented by the
# start_station_name column.
id_col=num_trips["start_station_name"].to_frame(),
)

# The model.fit() call above created a temporary model.
# Use the to_gbq() method to write to a permanent location.

model.to_gbq(
your_model_id, # For example: "bqml_tutorial.nyc_citibike_arima_model",
replace=True,
)
# [END bigquery_dataframes_bqml_arima_multiple_step_3_fit]

# [START bigquery_dataframes_bqml_arima_multiple_step_4_evaluate]
# Evaluate the time series models by using the summary() function. The summary()
# function shows you the evaluation metrics of all the candidate models evaluated
# during the process of automatic hyperparameter tuning.
summary = model.summary()
print(summary.peek())

# Expected output:
# start_station_name non_seasonal_p non_seasonal_d non_seasonal_q has_drift log_likelihood AIC variance ...
# 1 Central Park West & W 72 St 0 1 5 False -1966.449243 3944.898487 1215.689281 ...
# 8 Central Park W & W 96 St 0 0 5 False -274.459923 562.919847 655.776577 ...
# 9 Central Park West & W 102 St 0 0 0 False -226.639918 457.279835 258.83582 ...
# 11 Central Park West & W 76 St 1 1 2 False -1700.456924 3408.913848 383.254161 ...
# 4 Grand Army Plaza & Central Park S 0 1 5 False -5507.553498 11027.106996 624.138741 ...
# [END bigquery_dataframes_bqml_arima_multiple_step_4_evaluate]

# [START bigquery_dataframes_bqml_arima_multiple_step_5_coefficients]
coef = model.coef_
print(coef.peek())

# Expected output:
# start_station_name ar_coefficients ma_coefficients intercept_or_drift
# 5 Central Park West & W 68 St [] [-0.41014089 0.21979212 -0.59854213 -0.251438... 0.0
# 6 Central Park S & 6 Ave [] [-0.71488957 -0.36835772 0.61008532 0.183290... 0.0
# 0 Central Park West & W 85 St [] [-0.39270166 -0.74494638 0.76432596 0.489146... 0.0
# 3 W 82 St & Central Park West [-0.50219511 -0.64820817] [-0.20665325 0.67683137 -0.68108631] 0.0
# 11 W 106 St & Central Park West [-0.70442887 -0.66885553 -0.25030325 -0.34160669] [] 0.0
# [END bigquery_dataframes_bqml_arima_multiple_step_5_coefficients]

# [START bigquery_dataframes_bqml_arima_multiple_step_6_forecast]
prediction = model.predict(horizon=3, confidence_level=0.9)

print(prediction.peek())
# Expected output:
# forecast_timestamp start_station_name forecast_value standard_error confidence_level ...
# 4 2016-10-01 00:00:00+00:00 Central Park S & 6 Ave 302.377201 32.572948 0.9 ...
# 14 2016-10-02 00:00:00+00:00 Central Park North & Adam Clayton Powell Blvd 263.917567 45.284082 0.9 ...
# 1 2016-09-25 00:00:00+00:00 Central Park West & W 85 St 189.574706 39.874856 0.9 ...
# 20 2016-10-02 00:00:00+00:00 Central Park West & W 72 St 175.474862 40.940794 0.9 ...
# 12 2016-10-01 00:00:00+00:00 W 106 St & Central Park West 63.88163 18.088868 0.9 ...
# [END bigquery_dataframes_bqml_arima_multiple_step_6_forecast]