Skip to content

Commit 4f9af0f

Browse files
authored
Changed dataframe_from_result_table function (#538)
* Added dictionary options as parameters for type matching for dataframe_from_result_table func * reformatted the file * reformatted the file * reformatted the file * Fixes after PR * Fixes after PR * Fixes after PR * Fixes after PR * Fixes after PR * For python 3.7 3.8 * Nullable bools fix * Nullable bools fix * Nullable bools fix * Nullable bools fix2 * PR comment fixes * LRU cache default parameters for python 3.7 * LRU cache maxsize=1 * modified changes requested * black
1 parent 2cfe180 commit 4f9af0f

File tree

6 files changed

+449
-228
lines changed

6 files changed

+449
-228
lines changed

azure-kusto-data/azure-kusto-data.iml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<module type="PYTHON_MODULE" version="4">
3+
<component name="NewModuleRootManager" inherit-compiler-output="true">
4+
<exclude-output />
5+
<content url="file://$MODULE_DIR$">
6+
<sourceFolder url="file://$MODULE_DIR$/azure" isTestSource="false" />
7+
</content>
8+
<orderEntry type="inheritedJdk" />
9+
<orderEntry type="sourceFolder" forTests="false" />
10+
</component>
11+
<component name="PackageRequirementsSettings">
12+
<option name="requirementsPath" value="" />
13+
</component>
14+
<component name="TestRunnerService">
15+
<option name="PROJECT_TEST_RUNNER" value="py.test" />
16+
</component>
17+
</module>
Lines changed: 99 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,54 @@
11
import sys
2-
from typing import TYPE_CHECKING, Union
2+
from typing import TYPE_CHECKING, Union, Callable, Dict, Optional
3+
from functools import lru_cache
4+
import copy
35

46
if TYPE_CHECKING:
5-
import pandas
7+
import pandas as pd
68
from azure.kusto.data._models import KustoResultTable, KustoStreamingResultTable
79

10+
# Alias for dataframe_from_result_table converter type
11+
Converter = Dict[str, Union[str, Callable[[str, "pd.DataFrame"], "pd.Series"]]]
812

9-
# Copyright (c) Microsoft Corporation.
10-
# Licensed under the MIT License
11-
def to_pandas_timedelta(raw_value: Union[int, float, str]) -> "pandas.Timedelta":
12-
"""
13-
Transform a raw python value to a pandas timedelta.
14-
"""
13+
14+
@lru_cache(maxsize=1, typed=False)
def default_dict() -> "Converter":
    """Build the default mapping from Kusto scalar type names to column converters.

    Every value is a callable that receives ``(column_name, dataframe)`` and returns the converted column.
    The result is memoized by ``lru_cache``, so each call returns the very same dict object; treat it as
    read-only, because mutating it changes the defaults for every later caller.

    :return: dictionary keyed by Kusto type name ("string", "guid", "dynamic", "bool", "int", "long", "real",
        "decimal", "datetime", "timespan").
    """
    import pandas as pd

    def convert_string(col, df):
        # The pandas attribute is spelled ``StringDtype``; probing for "StringDType" never matched, which
        # silently left string columns unconverted. The guard leaves the column untouched on pandas
        # versions that do not expose the dtype.
        if hasattr(pd, "StringDtype"):
            return df[col].astype(pd.StringDtype())
        return df[col]

    return {
        "string": convert_string,
        "guid": lambda col, df: df[col],
        "dynamic": lambda col, df: df[col],
        "bool": lambda col, df: df[col].astype(bool),
        "int": lambda col, df: df[col].astype(pd.Int32Dtype()),
        "long": lambda col, df: df[col].astype(pd.Int64Dtype()),
        # The parse_* helpers are resolved lazily, when a converter is actually invoked.
        "real": lambda col, df: parse_float(df, col),
        "decimal": lambda col, df: parse_float(df, col),
        "datetime": lambda col, df: parse_datetime(df, col),
        "timespan": lambda col, df: df[col].apply(parse_timedelta),
    }
30+
3031

32+
# Copyright (c) Microsoft Corporation.
33+
# Licensed under the MIT License
3134

32-
def dataframe_from_result_table(table: "Union[KustoResultTable, KustoStreamingResultTable]", nullable_bools: bool = False) -> "pandas.DataFrame":
33-
"""Converts Kusto tables into pandas DataFrame.
35+
36+
def dataframe_from_result_table(
37+
table: "Union[KustoResultTable, KustoStreamingResultTable]",
38+
nullable_bools: bool = False,
39+
converters_by_type: Optional[Converter] = None,
40+
converters_by_column_name: Optional[Converter] = None,
41+
) -> "pd.DataFrame":
42+
f"""Converts Kusto tables into pandas DataFrame.
3443
:param azure.kusto.data._models.KustoResultTable table: Table received from the response.
3544
:param nullable_bools: When True, converts bools that are 'null' from kusto or 'None' from python to pandas.NA. This will be the default in the future.
45+
:param converters_by_type: If given, converts specified types to corresponding types, else uses {default_dict()}. The dictionary maps from kusto
46+
datatype (https://learn.microsoft.com/azure/data-explorer/kusto/query/scalar-data-types/) to a lambda that receives a column name and a dataframe and
47+
returns the converted column or to a string type name.
48+
:param converters_by_column_name: If given, converts specified columns to corresponding types, else uses converters_by_type. The dictionary maps from column
49+
name to a lambda that receives a column name and a dataframe and returns the converted column.
3650
:return: pandas DataFrame.
3751
"""
38-
import numpy as np
3952
import pandas as pd
4053

4154
if not table:
@@ -48,33 +61,23 @@ def dataframe_from_result_table(table: "Union[KustoResultTable, KustoStreamingRe
4861

4962
columns = [col.column_name for col in table.columns]
5063
frame = pd.DataFrame(table.raw_rows, columns=columns)
64+
default = default_dict()
5165

52-
# fix types
5366
for col in table.columns:
54-
if col.column_type == "string" and hasattr(pd, "StringDType"):
55-
frame[col.column_name] = frame[col.column_name].astype(pd.StringDType())
56-
if col.column_type == "bool":
57-
frame[col.column_name] = frame[col.column_name].astype(pd.BooleanDtype() if nullable_bools else bool)
58-
elif col.column_type == "int":
59-
frame[col.column_name] = frame[col.column_name].astype(pd.Int32Dtype())
60-
elif col.column_type == "long":
61-
frame[col.column_name] = frame[col.column_name].astype(pd.Int64Dtype())
62-
elif col.column_type == "real" or col.column_type == "decimal":
63-
frame[col.column_name] = frame[col.column_name].replace("NaN", np.NaN).replace("Infinity", np.PINF).replace("-Infinity", np.NINF)
64-
frame[col.column_name] = pd.to_numeric(frame[col.column_name], errors="coerce").astype(pd.Float64Dtype())
65-
elif col.column_type == "datetime":
66-
# Pandas before version 2 doesn't support the "format" arg
67-
args = {}
68-
if pd.__version__.startswith("2."):
69-
args = {"format": "ISO8601", "utc": True}
70-
else:
71-
# if frame contains ".", replace "Z" with ".000Z"
72-
# == False is not a mistake - that's the pandas way to do it
73-
contains_dot = frame[col.column_name].str.contains(".")
74-
frame.loc[contains_dot == False, col.column_name] = frame.loc[contains_dot == False, col.column_name].str.replace("Z", ".000Z")
75-
frame[col.column_name] = pd.to_datetime(frame[col.column_name], errors="coerce", **args)
76-
elif col.column_type == "timespan":
77-
frame[col.column_name] = frame[col.column_name].apply(to_pandas_timedelta)
67+
column_name = col.column_name
68+
column_type = col.column_type
69+
if converters_by_column_name and column_name in converters_by_column_name:
70+
converter = converters_by_column_name[column_name]
71+
elif converters_by_type and column_type in converters_by_type:
72+
converter = converters_by_type[column_type]
73+
elif nullable_bools and column_type == "bool":
74+
converter = lambda col, df: df[col].astype(pd.BooleanDtype())
75+
else:
76+
converter = default[column_type]
77+
if isinstance(converter, str):
78+
frame[column_name] = frame[column_name].astype(converter)
79+
else:
80+
frame[column_name] = converter(column_name, frame)
7881

7982
return frame
8083

@@ -87,3 +90,50 @@ def get_string_tail_lower_case(val, length):
8790
return val.lower()
8891

8992
return val[len(val) - length :].lower()
93+
94+
95+
# TODO When moving to pandas 2 only - change to the appropriate type
96+
def parse_float(frame, col):
    """Convert column ``col`` of ``frame`` to pandas' nullable ``Float64`` dtype.

    The strings "NaN", "Infinity" and "-Infinity" are mapped to the matching float values before the
    numeric conversion. Any other value that cannot be parsed as a number is coerced to a missing value.

    :param frame: DataFrame holding the column; the column is overwritten in place with the converted data.
    :param col: name of the column to convert.
    :return: the converted column, as stored back in ``frame``.
    """
    import pandas as pd

    # Plain Python floats replace np.NaN / np.PINF / np.NINF, which NumPy 2.0 removed.
    special_values = {"NaN": float("nan"), "Infinity": float("inf"), "-Infinity": float("-inf")}

    def decode(value):
        # Only strings are looked up, so unhashable cells (lists, dicts) never reach the dict.
        if isinstance(value, str):
            return special_values.get(value, value)
        return value

    decoded = frame[col].map(decode)
    frame[col] = pd.to_numeric(decoded, errors="coerce").astype(pd.Float64Dtype())
    return frame[col]
103+
104+
105+
def parse_datetime(frame, col):
    """Convert column ``col`` of ``frame`` to pandas datetimes; unparseable values become ``NaT``.

    On pandas 2 and later the values are parsed with ``format="ISO8601"`` and ``utc=True``. Older pandas
    does not support that format argument, so values without a fractional-seconds part get ".000" inserted
    before the trailing "Z" first, giving every row the same layout.

    :param frame: DataFrame holding the column; the column is overwritten in place with the converted data.
    :param col: name of the column to convert.
    :return: the converted column, as stored back in ``frame``.
    """
    import pandas as pd

    # Compare the major version numerically so pandas 3+ also takes the ISO8601 path.
    major_version = int(pd.__version__.split(".")[0])
    if major_version >= 2:
        args = {"format": "ISO8601", "utc": True}
    else:
        args = {}
        # regex=False matters: as a regular expression "." matches any character, so every non-empty
        # value would look like it already had a fractional part and nothing would be padded.
        has_fraction = frame[col].str.contains(".", regex=False)
        # Element-wise comparison keeps missing values (NaN in the mask) out of the selection.
        needs_padding = has_fraction.eq(False)
        frame.loc[needs_padding, col] = frame.loc[needs_padding, col].str.replace("Z", ".000Z", regex=False)
    frame[col] = pd.to_datetime(frame[col], errors="coerce", **args)
    return frame[col]
119+
120+
121+
def parse_timedelta(raw_value: Union[int, float, str]) -> "pd.Timedelta":
    """
    Transform a raw python value to a pandas timedelta.

    :param raw_value: either a tick count (int or float, 1 tick == 100 nanoseconds) or a timespan string
        formatted as '[-][d.]hh:mm:ss[.ssssss]'.
    :return: the matching pandas Timedelta, or None when ``raw_value`` is neither a number nor a string.
    """
    import pandas as pd

    if isinstance(raw_value, (int, float)):
        # https://docs.microsoft.com/en-us/dotnet/api/system.datetime.ticks
        # Kusto saves up to ticks, 1 tick == 100 nanoseconds
        return pd.to_timedelta(raw_value * 100, unit="ns")
    if isinstance(raw_value, str):
        # The timespan format Kusto returns is 'd.hh:mm:ss.ssssss' or 'hh:mm:ss.ssssss' or 'hh:mm:ss'
        # Pandas expects 'd days hh:mm:ss.ssssss' or 'hh:mm:ss.ssssss' or 'hh:mm:ss'
        # A leading '-' negates the whole timespan, whereas pandas reads '-1 days 02:00:00' as
        # -1 day PLUS 2 hours. Parse the magnitude on its own and negate the result afterwards.
        is_negative = raw_value.startswith("-")
        magnitude = raw_value[1:] if is_negative else raw_value
        if "." in magnitude.split(":")[0]:
            # Only a dot before the first ':' separates days; later dots are fractional seconds.
            magnitude = magnitude.replace(".", " days ", 1)
        parsed = pd.to_timedelta(magnitude)
        return -parsed if is_negative else parsed
    # Anything else (e.g. None for a null cell) yields None, as before.
    return None

0 commit comments

Comments
 (0)