SNOW-2019088: Extend write_pandas by a parameter for schema inference #2250

Open · wants to merge 3 commits into main
1 change: 1 addition & 0 deletions DESCRIPTION.md
@@ -9,6 +9,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-connector-python

 # Release Notes
 - v3.16(TBD)
   - Added basic Arrow support for Interval types.
+  - Added an `infer_schema` parameter to `write_pandas` to perform schema inference on the passed data.
   - Fixed `write_pandas` handling of special characters in the location name.

 - v3.15.0 (Apr 29, 2025)
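For reference, a minimal sketch of how the new flag would be used from application code (the connection parameters, table name, and DataFrame below are all hypothetical):

```python
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

# Hypothetical connection details for illustration only.
conn = snowflake.connector.connect(
    account="my_account",
    user="my_user",
    password="***",
    database="MY_DB",
    schema="PUBLIC",
)
df = pd.DataFrame({"ID": [1, 2], "NAME": ["a", "b"]})

# With infer_schema=True, column types are derived from the staged data
# rather than assumed from the existing table definition.
success, nchunks, nrows, _ = write_pandas(
    conn, df, table_name="MY_TABLE", infer_schema=True
)
```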
49 changes: 27 additions & 22 deletions src/snowflake/connector/pandas_tools.py
@@ -263,6 +263,7 @@ def write_pandas(
     on_error: str = "abort_statement",
     parallel: int = 4,
     quote_identifiers: bool = True,
+    infer_schema: bool = False,
     auto_create_table: bool = False,
     create_temp_table: bool = False,
     overwrite: bool = False,
@@ -321,6 +322,8 @@
         quote_identifiers: By default, identifiers, specifically database, schema, table and column names
             (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting.
             I.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True)
+        infer_schema: Perform explicit schema inference on the data in the DataFrame and use the inferred data types
+            when selecting columns from the DataFrame. (Default value = False)
         auto_create_table: When true, will automatically create a table with corresponding columns for each column in
             the passed in DataFrame. The table will not be created if it already exists
         create_temp_table: (Deprecated) Will make the auto-created table as a temporary table
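A brief illustration of the `quote_identifiers` behavior documented above (reusing `conn` and `df` from the earlier sketch; the table name is hypothetical):

```python
# Default quote_identifiers=True: names are sent quoted, so case is
# preserved and this targets a table literally named "MyTable".
write_pandas(conn, df, table_name="MyTable")

# quote_identifiers=False: names are sent unquoted and Snowflake coerces
# them to uppercase, so this resolves to MYTABLE instead.
write_pandas(conn, df, table_name="MyTable", quote_identifiers=False)
```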
@@ -487,7 +490,7 @@ def drop_object(name: str, object_type: str) -> None:
             num_statements=1,
         )

-    if auto_create_table or overwrite:
+    if auto_create_table or overwrite or infer_schema:
         file_format_location = _create_temp_file_format(
             cursor,
             database,
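For context on why `infer_schema` now also takes this branch: the temporary file format lets the staged Parquet be probed for types before any COPY. A minimal sketch of deriving a column-to-type mapping with Snowflake's built-in INFER_SCHEMA table function (the helper name and identifiers are illustrative, not the connector's exact internals):

```python
def infer_column_types(cursor, stage_location: str, file_format: str) -> dict:
    # INFER_SCHEMA inspects the staged files and reports one row per column
    # with its name and the Snowflake type it maps to.
    cursor.execute(
        "SELECT COLUMN_NAME, TYPE FROM table(infer_schema("
        f"location=>'{stage_location}', file_format=>'{file_format}'))"
    )
    # e.g. {"ID": "NUMBER(38, 0)", "NAME": "TEXT"}
    return {name: col_type for name, col_type in cursor.fetchall()}
```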
@@ -526,27 +529,29 @@ def drop_object(name: str, object_type: str) -> None:
             quote_identifiers,
         )

-        iceberg = "ICEBERG " if iceberg_config else ""
-        iceberg_config_statement = _iceberg_config_statement_helper(
-            iceberg_config or {}
-        )
+        if auto_create_table or overwrite:
+            iceberg = "ICEBERG " if iceberg_config else ""
+            iceberg_config_statement = _iceberg_config_statement_helper(
+                iceberg_config or {}
+            )

-        create_table_sql = (
-            f"CREATE {table_type.upper()} {iceberg}TABLE IF NOT EXISTS identifier(?) "
-            f"({create_table_columns}) {iceberg_config_statement}"
-            f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
-        )
-        params = (target_table_location,)
-        logger.debug(
-            f"auto creating table with '{create_table_sql}'. params: %s", params
-        )
-        cursor.execute(
-            create_table_sql,
-            _is_internal=True,
-            _force_qmark_paramstyle=True,
-            params=params,
-            num_statements=1,
-        )
+            create_table_sql = (
+                f"CREATE {table_type.upper()} {iceberg}TABLE IF NOT EXISTS identifier(?) "
+                f"({create_table_columns}) {iceberg_config_statement}"
+                f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+            )
+            params = (target_table_location,)
+            logger.debug(
+                f"auto creating table with '{create_table_sql}'. params: %s", params
+            )
+            cursor.execute(
+                create_table_sql,
+                _is_internal=True,
+                _force_qmark_paramstyle=True,
+                params=params,
+                num_statements=1,
+            )
         # need explicit casting when the underlying table schema is inferred
         parquet_columns = "$1:" + ",$1:".join(
             f"{quote}{snowflake_col}{quote}::{column_type_mapping[col]}"
@@ -584,7 +589,7 @@ def drop_object(name: str, object_type: str) -> None:
             f"FILE_FORMAT=("
             f"TYPE=PARQUET "
             f"COMPRESSION={compression_map[compression]}"
-            f"{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}"
+            f"{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite or infer_schema else ''}"
             f"{sql_use_logical_type}"
             f") "
             f"PURGE=TRUE ON_ERROR=?"