1
1
import sys
2
- from typing import TYPE_CHECKING , Union
2
+ from typing import TYPE_CHECKING , Union , Callable , Dict , Optional
3
+ from functools import lru_cache
4
+ import copy
3
5
4
6
if TYPE_CHECKING :
5
- import pandas
7
+ import pandas as pd
6
8
from azure .kusto .data ._models import KustoResultTable , KustoStreamingResultTable
7
9
10
+ # Alias for dataframe_from_result_table converter type
11
+ Converter = Dict [str , Union [str , Callable [[str , "pd.DataFrame" ], "pd.Series" ]]]
8
12
9
- # Copyright (c) Microsoft Corporation.
10
- # Licensed under the MIT License
11
- def to_pandas_timedelta (raw_value : Union [int , float , str ]) -> "pandas.Timedelta" :
12
- """
13
- Transform a raw python value to a pandas timedelta.
14
- """
13
+
14
@lru_cache(maxsize=1, typed=False)
def default_dict() -> Converter:
    """
    Build the default mapping from Kusto scalar type names to column converters.

    Every value is a callable that receives ``(column_name, dataframe)`` and returns
    the converted column. The result is memoized by ``lru_cache``, so each call
    returns the same dictionary object; copy it before modifying it.

    :return: Dictionary keyed by Kusto type name ("string", "guid", "dynamic", "bool",
        "int", "long", "real", "decimal", "datetime", "timespan").
    """
    import pandas as pd

    # pandas names the nullable string dtype ``StringDtype`` (lower-case "t"). Probing for
    # "StringDType" never matched, which silently left string columns unconverted.
    has_string_dtype = hasattr(pd, "StringDtype")

    return {
        "string": lambda col, df: df[col].astype(pd.StringDtype()) if has_string_dtype else df[col],
        # guid and dynamic values are passed through untouched.
        "guid": lambda col, df: df[col],
        "dynamic": lambda col, df: df[col],
        "bool": lambda col, df: df[col].astype(bool),
        "int": lambda col, df: df[col].astype(pd.Int32Dtype()),
        "long": lambda col, df: df[col].astype(pd.Int64Dtype()),
        "real": lambda col, df: parse_float(df, col),
        "decimal": lambda col, df: parse_float(df, col),
        "datetime": lambda col, df: parse_datetime(df, col),
        "timespan": lambda col, df: df[col].apply(parse_timedelta),
    }
30
+
30
31
32
+ # Copyright (c) Microsoft Corporation.
33
+ # Licensed under the MIT License
31
34
32
- def dataframe_from_result_table (table : "Union[KustoResultTable, KustoStreamingResultTable]" , nullable_bools : bool = False ) -> "pandas.DataFrame" :
33
- """Converts Kusto tables into pandas DataFrame.
35
+
36
+ def dataframe_from_result_table (
37
+ table : "Union[KustoResultTable, KustoStreamingResultTable]" ,
38
+ nullable_bools : bool = False ,
39
+ converters_by_type : Optional [Converter ] = None ,
40
+ converters_by_column_name : Optional [Converter ] = None ,
41
+ ) -> "pd.DataFrame" :
42
+ f"""Converts Kusto tables into pandas DataFrame.
34
43
:param azure.kusto.data._models.KustoResultTable table: Table received from the response.
35
44
:param nullable_bools: When True, converts bools that are 'null' from kusto or 'None' from python to pandas.NA. This will be the default in the future.
45
+ :param converters_by_type: If given, converts specified types to corresponding types, else uses { default_dict ()} . The dictionary maps from kusto
46
+ datatype (https://learn.microsoft.com/azure/data-explorer/kusto/query/scalar-data-types/) to a lambda that receives a column name and a dataframe and
47
+ returns the converted column or to a string type name.
48
+ :param converters_by_column_name: If given, converts specified columns to corresponding types, else uses converters_by_type. The dictionary maps from column
49
+ name to a lambda that receives a column name and a dataframe and returns the converted column.
36
50
:return: pandas DataFrame.
37
51
"""
38
- import numpy as np
39
52
import pandas as pd
40
53
41
54
if not table :
@@ -48,33 +61,23 @@ def dataframe_from_result_table(table: "Union[KustoResultTable, KustoStreamingRe
48
61
49
62
columns = [col .column_name for col in table .columns ]
50
63
frame = pd .DataFrame (table .raw_rows , columns = columns )
64
+ default = default_dict ()
51
65
52
- # fix types
53
66
for col in table .columns :
54
- if col .column_type == "string" and hasattr (pd , "StringDType" ):
55
- frame [col .column_name ] = frame [col .column_name ].astype (pd .StringDType ())
56
- if col .column_type == "bool" :
57
- frame [col .column_name ] = frame [col .column_name ].astype (pd .BooleanDtype () if nullable_bools else bool )
58
- elif col .column_type == "int" :
59
- frame [col .column_name ] = frame [col .column_name ].astype (pd .Int32Dtype ())
60
- elif col .column_type == "long" :
61
- frame [col .column_name ] = frame [col .column_name ].astype (pd .Int64Dtype ())
62
- elif col .column_type == "real" or col .column_type == "decimal" :
63
- frame [col .column_name ] = frame [col .column_name ].replace ("NaN" , np .NaN ).replace ("Infinity" , np .PINF ).replace ("-Infinity" , np .NINF )
64
- frame [col .column_name ] = pd .to_numeric (frame [col .column_name ], errors = "coerce" ).astype (pd .Float64Dtype ())
65
- elif col .column_type == "datetime" :
66
- # Pandas before version 2 doesn't support the "format" arg
67
- args = {}
68
- if pd .__version__ .startswith ("2." ):
69
- args = {"format" : "ISO8601" , "utc" : True }
70
- else :
71
- # if frame contains ".", replace "Z" with ".000Z"
72
- # == False is not a mistake - that's the pandas way to do it
73
- contains_dot = frame [col .column_name ].str .contains ("." )
74
- frame .loc [contains_dot == False , col .column_name ] = frame .loc [contains_dot == False , col .column_name ].str .replace ("Z" , ".000Z" )
75
- frame [col .column_name ] = pd .to_datetime (frame [col .column_name ], errors = "coerce" , ** args )
76
- elif col .column_type == "timespan" :
77
- frame [col .column_name ] = frame [col .column_name ].apply (to_pandas_timedelta )
67
+ column_name = col .column_name
68
+ column_type = col .column_type
69
+ if converters_by_column_name and column_name in converters_by_column_name :
70
+ converter = converters_by_column_name [column_name ]
71
+ elif converters_by_type and column_type in converters_by_type :
72
+ converter = converters_by_type [column_type ]
73
+ elif nullable_bools and column_type == "bool" :
74
+ converter = lambda col , df : df [col ].astype (pd .BooleanDtype ())
75
+ else :
76
+ converter = default [column_type ]
77
+ if isinstance (converter , str ):
78
+ frame [column_name ] = frame [column_name ].astype (converter )
79
+ else :
80
+ frame [column_name ] = converter (column_name , frame )
78
81
79
82
return frame
80
83
@@ -87,3 +90,50 @@ def get_string_tail_lower_case(val, length):
87
90
return val .lower ()
88
91
89
92
return val [len (val ) - length :].lower ()
93
+
94
+
95
+ # TODO When moving to pandas 2 only - change to the appropriate type
96
def parse_float(frame, col):
    """
    Convert a column to pandas' nullable ``Float64`` dtype.

    The strings "NaN", "Infinity" and "-Infinity" are first replaced with the matching
    float values; any remaining value that cannot be parsed as a number is coerced to
    a missing value. The column is also rewritten in place on ``frame``.

    :param frame: DataFrame that holds the column.
    :param col: Name of the column to convert.
    :return: The converted column.
    """
    import numpy as np
    import pandas as pd

    # np.NaN, np.PINF and np.NINF were removed in NumPy 2.0; the spellings below
    # exist on every NumPy version.
    frame[col] = frame[col].replace("NaN", np.nan).replace("Infinity", np.inf).replace("-Infinity", -np.inf)
    frame[col] = pd.to_numeric(frame[col], errors="coerce").astype(pd.Float64Dtype())
    return frame[col]
103
+
104
+
105
def parse_datetime(frame, col):
    """
    Convert a column of datetime strings to pandas datetimes.

    Values that cannot be parsed are coerced to a missing value. The column is also
    rewritten in place on ``frame``.

    :param frame: DataFrame that holds the column.
    :param col: Name of the column to convert.
    :return: The converted column.
    """
    import pandas as pd

    # Pandas before version 2 doesn't support the "format" arg. Compare the major version
    # numerically so that pandas 3 and later also take the modern path.
    major = pd.__version__.split(".", 1)[0]
    if major.isdigit() and int(major) >= 2:
        args = {"format": "ISO8601", "utc": True}
    else:
        args = {}
        # If a value has no fractional seconds, replace "Z" with ".000Z".
        # regex=False is required: as a regex, "." matches any character, so the
        # check was always true and the fix-up never ran.
        contains_dot = frame[col].str.contains(".", regex=False)
        # == False is not a mistake - it skips missing values, for which the mask is NaN.
        lacks_dot = contains_dot == False
        frame.loc[lacks_dot, col] = frame.loc[lacks_dot, col].str.replace("Z", ".000Z", regex=False)
    frame[col] = pd.to_datetime(frame[col], errors="coerce", **args)
    return frame[col]
119
+
120
+
121
def parse_timedelta(raw_value: Union[int, float, str]) -> "pd.Timedelta":
    """
    Turn a raw timespan value into a pandas Timedelta.

    Numeric input is interpreted as ticks (1 tick == 100 nanoseconds); string input is
    reshaped into the textual form pandas understands. Any other input yields None.
    """
    import pandas as pd

    if isinstance(raw_value, str):
        # Kusto returns 'd.hh:mm:ss.ssssss', 'hh:mm:ss.ssssss' or 'hh:mm:ss', while
        # pandas expects 'd days hh:mm:ss.ssssss', 'hh:mm:ss.ssssss' or 'hh:mm:ss'.
        # A dot before the first colon therefore marks a leading day count.
        leading_field = raw_value.partition(":")[0]
        has_day_part = "." in leading_field
        text = raw_value.replace(".", " days ", 1) if has_day_part else raw_value
        return pd.to_timedelta(text)

    if isinstance(raw_value, (int, float)):
        # https://docs.microsoft.com/en-us/dotnet/api/system.datetime.ticks
        # Kusto saves up to ticks, 1 tick == 100 nanoseconds
        nanoseconds_per_tick = 100
        return pd.to_timedelta(raw_value * nanoseconds_per_tick, unit="ns")

    return None
0 commit comments