Fix pandas large ints (#454)

genzgd · web-flow · commit 697794e2bb13 · 2025-01-14T04:58:53.000-07:00
* Fix pandas inserts into big int columns

* Tinkering with pandas insert data types
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,11 @@ release (0.9.0), unrecognized arguments/keywords for these methods of creating a
 instead of being passed as ClickHouse server settings. This is in conjunction with some refactoring in Client construction.
 The supported method of passing ClickHouse server settings is to prefix such arguments/query parameters with `ch_`.
 
+## 0.8.14, 2025-01-13
+### Bug Fix
+- Fix an edge case where a Pandas DataFrame that contains _only_ Int64 (or smaller) values would cause an exception when
+inserting into a ClickHouse "big int" column (Int128/UInt128/Int256/UInt256).  Closes https://github.com/ClickHouse/clickhouse-connect/issues/452
+
 ## 0.8.13, 2025-01-07
 ### Bug Fix
 - Fix missing default for new access_token parameter.  Thanks to [Lukas Thaler](https://github.com/lukasthalerINNIO) for the PR.
diff --git a/clickhouse_connect/__version__.py b/clickhouse_connect/__version__.py
@@ -1 +1 @@
-version = '0.8.13'
+version = '0.8.14'
diff --git a/clickhouse_connect/datatypes/numeric.py b/clickhouse_connect/datatypes/numeric.py
@@ -98,6 +98,7 @@ def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence:
 class BigInt(ClickHouseType, registered=False):
     _signed = True
     valid_formats = 'string', 'native'
+    python_type = int
 
     def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state: Any):
         signed = self._signed
diff --git a/clickhouse_connect/driver/insert.py b/clickhouse_connect/driver/insert.py
@@ -148,19 +148,20 @@ def _convert_pandas(self, df):
         data = []
         for df_col_name, col_name, ch_type in zip(df.columns, self.column_names, self.column_types):
             df_col = df[df_col_name]
-            d_type = str(df_col.dtype)
+            d_type_kind = df_col.dtype.kind
             if ch_type.python_type == int:
-                if 'float' in d_type:
+                if d_type_kind == 'f':
                     df_col = df_col.round().astype(ch_type.base_type, copy=False)
-                else:
-                    df_col = df_col.astype(ch_type.base_type, copy=False)
-            elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in d_type):
+                elif d_type_kind in ('i', 'u') and not df_col.hasnans:
+                    data.append(df_col.to_list())
+                    continue
+            elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in str(df_col.dtype)):
                 div = ch_type.nano_divisor
                 data.append([None if pd.isnull(x) else x.value // div for x in df_col])
                 self.column_formats[col_name] = 'int'
                 continue
             if ch_type.nullable:
-                if d_type == 'object':
+                if d_type_kind == 'O':
                     #  This is ugly, but the multiple replaces seem required as a result of this bug:
                     #  https://github.com/pandas-dev/pandas/issues/29024
                     df_col = df_col.replace({pd.NaT: None}).replace({np.nan: None})
diff --git a/tests/integration_tests/test_pandas.py b/tests/integration_tests/test_pandas.py
@@ -131,13 +131,14 @@ def test_pandas_low_card(test_client: Client, table_context: Callable):
 
 
 def test_pandas_large_types(test_client: Client, table_context: Callable):
-    columns = ['key String', 'value Int256']
+    columns = ['key String', 'value Int256', 'u_value UInt256'
+               ]
     key2_value = 30000000000000000000000000000000000
     if not test_client.min_version('21'):
         columns = ['key String', 'value Int64']
         key2_value = 3000000000000000000
     with table_context('test_pandas_big_int', columns):
-        df = pd.DataFrame([['key1', 2000], ['key2', key2_value]], columns=['key', 'value'])
+        df = pd.DataFrame([['key1', 2000, 50], ['key2', key2_value, 70], ['key3', -2350, 70]], columns=['key', 'value', 'u_value'])
         source_df = df.copy()
         test_client.insert_df('test_pandas_big_int', df)
         result_df = test_client.query_df('SELECT * FROM test_pandas_big_int')

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-version = '0.8.13'`
	`1`	`+version = '0.8.14'`