Skip to content

Commit 697794e

Browse files
authored
Fix pandas large ints (#454)
* Fix pandas inserts into big int columns * Tinkering with pandas insert data types
1 parent 8c94d92 commit 697794e

File tree

5 files changed

+17
-9
lines changed

5 files changed

+17
-9
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ release (0.9.0), unrecognized arguments/keywords for these methods of creating a
1717
instead of being passed as ClickHouse server settings. This is in conjunction with some refactoring in Client construction.
1818
The supported method of passing ClickHouse server settings is to prefix such arguments/query parameters with `ch_`.
1919

20+
## 0.8.14, 2025-01-13
21+
### Bug Fix
22+
- Fix an edge case where a Pandas dataframe that contains _only_ Int64 (or smaller) values would cause an exception when
23+
inserting into a ClickHouse "big int" column of type Int128, UInt128, Int256, or UInt256. Closes https://github.com/ClickHouse/clickhouse-connect/issues/452
24+
2025
## 0.8.13, 2025-01-07
2126
### Bug Fix
2227
- Fix missing default for new access_token parameter. Thanks to [Lukas Thaler](https://github.com/lukasthalerINNIO) for the PR.

clickhouse_connect/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
version = '0.8.13'
1+
version = '0.8.14'

clickhouse_connect/datatypes/numeric.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence:
9898
class BigInt(ClickHouseType, registered=False):
9999
_signed = True
100100
valid_formats = 'string', 'native'
101+
python_type = int
101102

102103
def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state: Any):
103104
signed = self._signed

clickhouse_connect/driver/insert.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -148,19 +148,20 @@ def _convert_pandas(self, df):
148148
data = []
149149
for df_col_name, col_name, ch_type in zip(df.columns, self.column_names, self.column_types):
150150
df_col = df[df_col_name]
151-
d_type = str(df_col.dtype)
151+
d_type_kind = df_col.dtype.kind
152152
if ch_type.python_type == int:
153-
if 'float' in d_type:
153+
if d_type_kind == 'f':
154154
df_col = df_col.round().astype(ch_type.base_type, copy=False)
155-
else:
156-
df_col = df_col.astype(ch_type.base_type, copy=False)
157-
elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in d_type):
155+
elif d_type_kind in ('i', 'u') and not df_col.hasnans:
156+
data.append(df_col.to_list())
157+
continue
158+
elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in str(df_col.dtype)):
158159
div = ch_type.nano_divisor
159160
data.append([None if pd.isnull(x) else x.value // div for x in df_col])
160161
self.column_formats[col_name] = 'int'
161162
continue
162163
if ch_type.nullable:
163-
if d_type == 'object':
164+
if d_type_kind == 'O':
164165
# This is ugly, but the multiple replaces seem required as a result of this bug:
165166
# https://github.com/pandas-dev/pandas/issues/29024
166167
df_col = df_col.replace({pd.NaT: None}).replace({np.nan: None})

tests/integration_tests/test_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,14 @@ def test_pandas_low_card(test_client: Client, table_context: Callable):
131131

132132

133133
def test_pandas_large_types(test_client: Client, table_context: Callable):
134-
columns = ['key String', 'value Int256']
134+
columns = ['key String', 'value Int256', 'u_value UInt256'
135+
]
135136
key2_value = 30000000000000000000000000000000000
136137
if not test_client.min_version('21'):
137138
columns = ['key String', 'value Int64']
138139
key2_value = 3000000000000000000
139140
with table_context('test_pandas_big_int', columns):
140-
df = pd.DataFrame([['key1', 2000], ['key2', key2_value]], columns=['key', 'value'])
141+
df = pd.DataFrame([['key1', 2000, 50], ['key2', key2_value, 70], ['key3', -2350, 70]], columns=['key', 'value', 'u_value'])
141142
source_df = df.copy()
142143
test_client.insert_df('test_pandas_big_int', df)
143144
result_df = test_client.query_df('SELECT * FROM test_pandas_big_int')

0 commit comments

Comments
 (0)