diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh
index 5624df3b8d..7da0881bbe 100755
--- a/.kokoro/release-nightly.sh
+++ b/.kokoro/release-nightly.sh
@@ -106,6 +106,7 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \
 # write access to
 COVERAGE_TABLE=bigframes-metrics.coverage_report.bigframes_coverage_nightly
 python3.10 scripts/publish_api_coverage.py \
+  bigquery \
   --bigframes_version=$BIGFRAMES_VERSION \
   --release_version=$RELEASE_VERSION \
-  --bigquery_table=$COVERAGE_TABLE
+  --bigquery_table_name=$COVERAGE_TABLE
diff --git a/docs/index.rst b/docs/index.rst
index d239ea3a78..b17ac7cbd9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -7,6 +7,7 @@ API reference
    :maxdepth: 3
 
    reference/index
+   supported_pandas_apis
 
 Changelog
 ---------
diff --git a/docs/supported_pandas_apis.rst b/docs/supported_pandas_apis.rst
new file mode 100644
index 0000000000..f4b57f05d1
--- /dev/null
+++ b/docs/supported_pandas_apis.rst
@@ -0,0 +1,62 @@
+Supported pandas APIs
+=====================
+
+The following tables show the pandas APIs that have been implemented (or not)
+in BigQuery DataFrames.
+
+* 'Y' means it implements all parameters.
+* 'P' means it implements only some parameters.
+
+DataFrame
+---------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_dataframe.html
+
+DataFrameGroupBy
+----------------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_dataframegroupby.html
+
+Index
+-----
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_index.html
+
+pandas module
+-------------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_pandas.html
+
+Series
+------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_series.html
+
+Series.dt methods
+-----------------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_datetimemethods.html
+
+Series.str methods
+------------------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_stringmethods.html
+
+SeriesGroupBy
+-------------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_seriesgroupby.html
+
+Window
+------
+
+.. raw:: html
+   :file: supported_pandas_apis/bf_window.html
diff --git a/docs/supported_pandas_apis/.gitignore b/docs/supported_pandas_apis/.gitignore
new file mode 100644
index 0000000000..2d19fc766d
--- /dev/null
+++ b/docs/supported_pandas_apis/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
index c07e6141f1..57b0522d04 100644
--- a/docs/templates/toc.yml
+++ b/docs/templates/toc.yml
@@ -72,6 +72,8 @@
         name: Series
       - name: Window
         uid: bigframes.core.window.Window
+      - href: supported_pandas_apis.html
+        name: Supported pandas APIs
       name: bigframes.pandas
     - items:
       - items:
diff --git a/noxfile.py b/noxfile.py
index a5e77964f1..4ac3a81723 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -467,6 +467,12 @@ def docs(session):
     )
 
     shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True)
+
+    session.run(
+        "python",
+        "scripts/publish_api_coverage.py",
+        "docs",
+    )
     session.run(
         "sphinx-build",
         "-W",  # warnings as errors
@@ -503,6 +509,12 @@ def docfx(session):
     )
 
     shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True)
+
+    session.run(
+        "python",
+        "scripts/publish_api_coverage.py",
+        "docs",
+    )
     session.run(
         "sphinx-build",
         "-T",  # show full traceback on exception
diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py
index 856307e440..4a35ade9ef 100644
--- a/scripts/publish_api_coverage.py
+++ b/scripts/publish_api_coverage.py
@@ -17,27 +17,114 @@
 
 import argparse
 import inspect
+import pathlib
+import sys
 
 import pandas as pd
+import pandas.core.groupby
+import pandas.core.indexes.accessors
+import pandas.core.strings.accessor
+import pandas.core.window.rolling
 
+import bigframes
+import bigframes.core.groupby
+import bigframes.core.window
+import bigframes.operations.datetimes
+import bigframes.operations.strings
 import bigframes.pandas as bpd
 
+REPO_ROOT = pathlib.Path(__file__).parent.parent
+
+# Reference-doc URL prefix per target; the member name is appended as the anchor.
+URL_PREFIX = {
+    "pandas": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_"
+    ),
+    "dataframe": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_"
+    ),
+    "dataframegroupby": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.DataFrameGroupBy#bigframes_core_groupby_DataFrameGroupBy_"
+    ),
+    "series": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_"
+    ),
+    "seriesgroupby": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.SeriesGroupBy#bigframes_core_groupby_SeriesGroupBy_"
+    ),
+    "datetimemethods": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.datetimes.DatetimeMethods#bigframes_operations_datetimes_DatetimeMethods_"
+    ),
+    "stringmethods": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.strings.StringMethods#bigframes_operations_strings_StringMethods_"
+    ),
+    "window": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.window.Window#bigframes_core_window_Window_"
+    ),
+    # TODO: Index not documented.
+}
+
+
+# (report prefix, pandas object, bigframes object) triples compared member by member.
+PANDAS_TARGETS = [
+    ("pandas", pd, bpd),
+    ("dataframe", pd.DataFrame, bpd.DataFrame),
+    (
+        "dataframegroupby",
+        pandas.core.groupby.DataFrameGroupBy,
+        bigframes.core.groupby.DataFrameGroupBy,
+    ),
+    ("series", pd.Series, bpd.Series),
+    (
+        "seriesgroupby",
+        pandas.core.groupby.SeriesGroupBy,
+        bigframes.core.groupby.SeriesGroupBy,
+    ),
+    (
+        "datetimemethods",
+        pandas.core.indexes.accessors.CombinedDatetimelikeProperties,
+        bigframes.operations.datetimes.DatetimeMethods,
+    ),
+    (
+        "stringmethods",
+        pandas.core.strings.accessor.StringMethods,
+        bigframes.operations.strings.StringMethods,
+    ),
+    (
+        "window",
+        pandas.core.window.rolling.Rolling,
+        bigframes.core.window.Window,
+    ),
+    ("index", pd.Index, bpd.Index),
+]
+
+
+def names_from_signature(signature):
+    """Extract the names of parameters from signature
+
+    See: https://docs.python.org/3/library/inspect.html#inspect.signature
+    """
+    return frozenset(signature.parameters)
+
+
+def calculate_missing_parameters(bigframes_function, target_function):
+    """Return parameter names ``target_function`` accepts but bigframes does not."""
+    bigframes_params = names_from_signature(inspect.signature(bigframes_function))
+    target_params = names_from_signature(inspect.signature(target_function))
+    return target_params - bigframes_params
+
 
 def generate_pandas_api_coverage():
     """Inspect all our pandas objects, and compare with the real pandas objects, to see
     which methods we implement. For each, generate a regex that can be used to check if its
     present in a notebook"""
-    header = ["api", "pattern", "kind", "is_in_bigframes"]
+    header = ["api", "pattern", "kind", "is_in_bigframes", "missing_parameters"]
     api_patterns = []
-    targets = [
-        ("pandas", pd, bpd),
-        ("dataframe", pd.DataFrame, bpd.DataFrame),
-        ("series", pd.Series, bpd.Series),
-        ("index", pd.Index, bpd.Index),
-    ]
     indexers = ["loc", "iloc", "iat", "ix", "at"]
-    for name, pandas_obj, bigframes_obj in targets:
+    for name, pandas_obj, bigframes_obj in PANDAS_TARGETS:
         for member in dir(pandas_obj):
+            missing_parameters = ""
+
             # skip private functions and properties
             if member[0] == "_" and member[1] != "_":
                 continue
@@ -50,6 +137,17 @@ def generate_pandas_api_coverage():
                 # Function, match .member(
                 token = f"\\.{member}\\("
                 token_type = "function"
+
+                if hasattr(bigframes_obj, member):
+                    bigframes_function = getattr(bigframes_obj, member)
+                    pandas_function = getattr(pandas_obj, member)
+                    missing_parameters = ", ".join(
+                        sorted(
+                            calculate_missing_parameters(
+                                bigframes_function, pandas_function
+                            )
+                        )
+                    )
             elif member in indexers:
                 # Indexer, match .indexer[
                 token = f"\\.{member}\\["
@@ -62,7 +160,13 @@ def generate_pandas_api_coverage():
             is_in_bigframes = hasattr(bigframes_obj, member)
 
             api_patterns.append(
-                [f"{name}.{member}", token, token_type, is_in_bigframes]
+                [
+                    f"{name}.{member}",
+                    token,
+                    token_type,
+                    is_in_bigframes,
+                    missing_parameters,
+                ]
             )
 
     return pd.DataFrame(api_patterns, columns=header)
@@ -165,14 +269,122 @@ def build_api_coverage_table(bigframes_version: str, release_version: str):
     return combined_df.infer_objects().convert_dtypes()
 
 
+def format_api(api_names, is_in_bigframes, api_prefix):
+    """Render API names as HTML, linking implemented ones to their reference docs."""
+    api_names = api_names.str.slice(start=len(f"{api_prefix}."))
+    formatted = "<code>" + api_names + "</code>"
+    url_prefix = URL_PREFIX.get(api_prefix)
+    if url_prefix is None:
+        return formatted
+
+    linked = '<a href="' + url_prefix + api_names + '">' + formatted + "</a>"
+    return formatted.mask(is_in_bigframes, linked)
+
+
+def generate_api_coverage(df, api_prefix):
+    """Build the Y/P/N implementation table for the APIs under ``api_prefix``."""
+    dataframe_apis = df.loc[df["api"].str.startswith(f"{api_prefix}.")]
+    fully_implemented = (
+        dataframe_apis["missing_parameters"].str.len() == 0
+    ) & dataframe_apis["is_in_bigframes"]
+    partial_implemented = (
+        dataframe_apis["missing_parameters"].str.len() != 0
+    ) & dataframe_apis["is_in_bigframes"]
+    not_implemented = ~dataframe_apis["is_in_bigframes"]
+    dataframe_table = pd.DataFrame(
+        {
+            "API": format_api(
+                dataframe_apis["api"],
+                dataframe_apis["is_in_bigframes"],
+                api_prefix,
+            ),
+            "Implemented": "",
+            "Missing parameters": dataframe_apis["missing_parameters"],
+        }
+    )
+    dataframe_table.loc[fully_implemented, "Implemented"] = "Y"
+    dataframe_table.loc[partial_implemented, "Implemented"] = "P"
+    dataframe_table.loc[not_implemented, "Implemented"] = "N"
+    return dataframe_table
+
+
+def generate_api_coverage_doc(df, api_prefix):
+    """Write the HTML table of implemented ``api_prefix`` APIs into the docs tree."""
+    dataframe_table = generate_api_coverage(df, api_prefix)
+    # Copy so the column assignment below does not hit SettingWithCopyWarning.
+    implemented = dataframe_table["Implemented"] != "N"
+    dataframe_table = dataframe_table.loc[implemented].copy()
+    dataframe_table["Implemented"] = dataframe_table["Implemented"].map(
+        {
+            "Y": "Y",
+            "P": "P",
+        }
+    )
+
+    with open(
+        REPO_ROOT / "docs" / "supported_pandas_apis" / f"bf_{api_prefix}.html",
+        "w",
+        encoding="utf-8",
+    ) as html_file:
+        dataframe_table.to_html(
+            html_file, index=False, header=True, escape=False, border=0, col_space="8em"
+        )
+
+
+def generate_api_coverage_docs(df):
+    """Write one HTML coverage table per entry in ``PANDAS_TARGETS``."""
+    for api_prefix, _, _ in PANDAS_TARGETS:
+        generate_api_coverage_doc(df, api_prefix)
+
+
+def print_api_coverage_summary(df, api_prefix):
+    """Print the count of APIs per implementation status for ``api_prefix``."""
+    dataframe_table = generate_api_coverage(df, api_prefix)
+
+    print(api_prefix)
+    print(dataframe_table[["Implemented", "API"]].groupby(["Implemented"]).count())
+    print(f"{api_prefix} APIs: {dataframe_table.shape[0]}\n")
+
+
+def print_api_coverage_summaries(df):
+    """Print a summary for every target, followed by the overall Y/P/N totals."""
+    for api_prefix, _, _ in PANDAS_TARGETS:
+        print_api_coverage_summary(df, api_prefix)
+
+    print(f"\nAll APIs: {len(df.index)}")
+    fully_implemented = (df["missing_parameters"].str.len() == 0) & df[
+        "is_in_bigframes"
+    ]
+    print(f"Y: {fully_implemented.sum()}")
+    partial_implemented = (df["missing_parameters"].str.len() != 0) & df[
+        "is_in_bigframes"
+    ]
+    print(f"P: {partial_implemented.sum()}")
+    not_implemented = ~df["is_in_bigframes"]
+    print(f"N: {not_implemented.sum()}")
+
+
 def main():
+    """Build the API coverage table and emit it in the requested output format."""
     parser = argparse.ArgumentParser()
-    parser.add_argument("--bigframes_version")
-    parser.add_argument("--release_version")
+    parser.add_argument("output_type")
+    parser.add_argument("--bigframes_version", default=bigframes.__version__)
+    parser.add_argument("--release_version", default="")
     parser.add_argument("--bigquery_table_name")
     args = parser.parse_args()
+    if args.output_type == "bigquery" and not args.bigquery_table_name:
+        parser.error("--bigquery_table_name is required for bigquery output")
     df = build_api_coverage_table(args.bigframes_version, args.release_version)
-    df.to_gbq(args.bigquery_table_name, if_exists="append")
+
+    if args.output_type == "bigquery":
+        df.to_gbq(args.bigquery_table_name, if_exists="append")
+    elif args.output_type == "docs":
+        generate_api_coverage_docs(df)
+    elif args.output_type == "summary":
+        print_api_coverage_summaries(df)
+    else:
+        print(f"Unexpected output_type {args.output_type!r}", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py
index 96b2d1bb48..061cc1c25c 100644
--- a/scripts/test_publish_api_coverage.py
+++ b/scripts/test_publish_api_coverage.py
@@ -27,6 +27,7 @@ def test_api_coverage_produces_expected_schema():
                 "string",
                 "boolean",
                 "string",
+                "string",
                 "datetime64[ns]",
                 "string",
                 "string",
@@ -36,6 +37,7 @@ def test_api_coverage_produces_expected_schema():
                 "pattern",
                 "kind",
                 "is_in_bigframes",
+                "missing_parameters",
                 "module",
                 "timestamp",
                 "bigframes_version",