8
8
from IPython .display import display as ipydisplay
9
9
from pyspark .sql import SparkSession
10
10
from pyspark .sql .dataframe import DataFrame
11
- from pyspark .sql .window import Window
11
+ from pyspark .sql .window import Window , WindowSpec
12
12
from scipy .fft import fft , fftfreq
13
13
14
14
import tempo .io as tio
15
15
import tempo .resample as rs
16
16
from tempo .interpol import Interpolation
17
- from tempo .utils import ENV_BOOLEAN , PLATFORM , calculate_time_horizon
17
+ from tempo .utils import (
18
+ ENV_CAN_RENDER_HTML ,
19
+ IS_DATABRICKS ,
20
+ calculate_time_horizon ,
21
+ get_display_df ,
22
+ )
18
23
19
24
logger = logging .getLogger (__name__ )
20
25
@@ -36,7 +41,7 @@ def __init__(self, df, ts_col="event_ts", partition_cols=None, sequence_col=None
36
41
self .partitionCols = (
37
42
[]
38
43
if partition_cols is None
39
- else self .__validated_columns (df , partition_cols )
44
+ else self .__validated_columns (df , partition_cols . copy () )
40
45
)
41
46
42
47
self .df = df
@@ -309,6 +314,10 @@ def __getTimePartitions(self, tsPartitionVal, fraction=0.1):
309
314
)
310
315
return TSDF (df , self .ts_col , self .partitionCols + ["ts_partition" ])
311
316
317
+ #
318
+ # Slicing & Selection
319
+ #
320
+
312
321
def select (self , * cols ):
313
322
"""
314
323
pyspark.sql.DataFrame.select() method's equivalent for TSDF objects
@@ -342,7 +351,164 @@ def select(self, *cols):
342
351
"In TSDF's select statement original ts_col, partitionCols and seq_col_stub(optional) must be present"
343
352
)
344
353
345
- def show (self , n = 20 , truncate = True , vertical = False ):
354
def __slice(self, op: str, target_ts):
    """
    Private method to slice TSDF by time

    :param op: string symbol of the comparison operation to perform,
        one of "==", "<", "<=", ">", ">="
    :type op: str
    :param target_ts: timestamp on which to filter

    :return: a TSDF object containing only those records within the time slice specified
    :raises ValueError: if ``op`` is not one of the supported comparison symbols
    """
    # Build the filter as a Column comparison rather than interpolating the
    # timestamp into a SQL string: this avoids quoting/escaping problems
    # (e.g. a string timestamp containing a single quote) and rejects
    # unrecognized operators instead of passing them through to Spark SQL.
    comparisons = {
        "==": lambda col, val: col == val,
        "<": lambda col, val: col < val,
        "<=": lambda col, val: col <= val,
        ">": lambda col, val: col > val,
        ">=": lambda col, val: col >= val,
    }
    try:
        slice_expr = comparisons[op](f.col(self.ts_col), f.lit(target_ts))
    except KeyError:
        raise ValueError(f"Unsupported slice operation: {op}") from None
    sliced_df = self.df.where(slice_expr)
    return TSDF(
        sliced_df,
        ts_col=self.ts_col,
        partition_cols=self.partitionCols,
        sequence_col=self.sequence_col,
    )
374
+
375
def at(self, ts):
    """
    Subset this TSDF to the records that fall exactly on a given timestamp.

    :param ts: the timestamp to match records against

    :return: a :class:`~tsdf.TSDF` containing only the records at time ``ts``
    """
    # delegate to the generic time-slice helper with an equality comparison
    return self.__slice("==", ts)
384
+
385
def before(self, ts):
    """
    Subset this TSDF to the records strictly earlier than a given timestamp.

    :param ts: the timestamp bounding the selection (exclusive)

    :return: a :class:`~tsdf.TSDF` containing only the records before time ``ts``
    """
    # delegate to the generic time-slice helper with a strict less-than
    return self.__slice("<", ts)
394
+
395
def atOrBefore(self, ts):
    """
    Subset this TSDF to the records at or earlier than a given timestamp.

    :param ts: the timestamp bounding the selection (inclusive)

    :return: a :class:`~tsdf.TSDF` containing only the records at or before time ``ts``
    """
    # delegate to the generic time-slice helper with an inclusive upper bound
    return self.__slice("<=", ts)
404
+
405
def after(self, ts):
    """
    Subset this TSDF to the records strictly later than a given timestamp.

    :param ts: the timestamp bounding the selection (exclusive)

    :return: a :class:`~tsdf.TSDF` containing only the records after time ``ts``
    """
    # delegate to the generic time-slice helper with a strict greater-than
    return self.__slice(">", ts)
414
+
415
def atOrAfter(self, ts):
    """
    Subset this TSDF to the records at or later than a given timestamp.

    :param ts: the timestamp bounding the selection (inclusive)

    :return: a :class:`~tsdf.TSDF` containing only the records at or after time ``ts``
    """
    # delegate to the generic time-slice helper with an inclusive lower bound
    return self.__slice(">=", ts)
424
+
425
def between(self, start_ts, end_ts, inclusive=True):
    """
    Subset this TSDF to the records falling between two timestamps.

    :param start_ts: lower bound of the range to select
    :param end_ts: upper bound of the range to select
    :param inclusive: when True (the default) the bounds themselves are
        included in the result
    :type inclusive: bool

    :return: a :class:`~tsdf.TSDF` containing only the records inside the range
    """
    # chain the two one-sided slices appropriate for the bound type
    lower_sliced = self.atOrAfter(start_ts) if inclusive else self.after(start_ts)
    return lower_sliced.atOrBefore(end_ts) if inclusive else lower_sliced.before(end_ts)
439
+
440
def __top_rows_per_series(self, win: WindowSpec, n: int):
    """
    Private helper that keeps only the first ``n`` rows of each series,
    where row order within a series is defined by the given window.

    :param win: window specification defining the per-series row ordering
    :param n: how many rows to keep from each series

    :return: a :class:`~tsdf.TSDF` with at most ``n`` rows per series
    """
    rank_col = "__row_num"
    # number the rows within each series...
    ranked_df = self.df.withColumn(rank_col, f.row_number().over(win))
    # ...keep the leading n of them, then discard the bookkeeping column
    top_rows_df = ranked_df.where(f.col(rank_col) <= f.lit(n)).drop(rank_col)
    return TSDF(
        top_rows_df,
        ts_col=self.ts_col,
        partition_cols=self.partitionCols,
        sequence_col=self.sequence_col,
    )
461
+
462
def earliest(self, n: int = 1):
    """
    Select the first n records (by time) from each series.

    :param n: how many records to keep per series (default 1)

    :return: a :class:`~tsdf.TSDF` holding the earliest n records of each series
    """
    # an ascending window puts the oldest records first
    ascending_win = self.__baseWindow(reverse=False)
    return self.__top_rows_per_series(ascending_win, n)
472
+
473
def latest(self, n: int = 1):
    """
    Select the last n records (by time) from each series.

    :param n: how many records to keep per series (default 1)

    :return: a :class:`~tsdf.TSDF` holding the latest n records of each series
    """
    # a descending window puts the newest records first
    descending_win = self.__baseWindow(reverse=True)
    return self.__top_rows_per_series(descending_win, n)
483
+
484
def priorTo(self, ts, n: int = 1):
    """
    Select the n most recent records at or before a given time - an
    'as of' style lookup: the records in effect as of ``ts``.

    :param ts: the as-of timestamp on which to filter records
    :param n: how many records to keep per series (default 1)

    :return: a :class:`~tsdf.TSDF` with the n records at or before the given time
    """
    return self.atOrBefore(ts).latest(n)
495
+
496
def subsequentTo(self, ts, n: int = 1):
    """
    Select the first n records at or after a given time.

    :param ts: the timestamp to look forward from
    :param n: how many records to keep per series (default 1)

    :return: a :class:`~tsdf.TSDF` with the n records at or after the given time
    """
    return self.atOrAfter(ts).earliest(n)
506
+
507
+ #
508
+ # Display functions
509
+ #
510
+
511
+ def show (self , n = 20 , k = 5 , truncate = True , vertical = False ):
346
512
"""
347
513
pyspark.sql.DataFrame.show() method's equivalent for TSDF objects
348
514
@@ -372,16 +538,14 @@ def show(self, n=20, truncate=True, vertical=False):
372
538
phone_accel_tsdf.show()
373
539
374
540
"""
375
- if PLATFORM == "DATABRICKS" or ENV_BOOLEAN is False :
376
- self .df .show (n , truncate , vertical )
377
- elif ENV_BOOLEAN :
541
+ # validate k <= n
542
+ if k > n :
543
+ raise ValueError (f"Parameter k { k } cannot be greater than parameter n { n } " )
544
+
545
+ if not (IS_DATABRICKS ) and ENV_CAN_RENDER_HTML :
378
546
# In Jupyter notebooks, for wide dataframes the below line will enable rendering the output in a scrollable format.
379
547
ipydisplay (HTML ("<style>pre { white-space: pre !important; }</style>" ))
380
- self .df .show (n , truncate , vertical )
381
- else :
382
- self .df .show (
383
- n , truncate = False
384
- ) # default show method behaviour in case all condition fails
548
+ get_display_df (self , k ).show (n , truncate , vertical )
385
549
386
550
def describe (self ):
387
551
"""
@@ -672,23 +836,34 @@ def asofJoin(
672
836
673
837
return asofDF
674
838
675
def __baseWindow(self, sort_col=None, reverse=False):
    """
    Private helper that builds the base window over this TSDF's series.

    :param sort_col: column to sort on (defaults to the ts_col)
    :param reverse: when True, order rows descending instead of ascending

    :return: a WindowSpec ordered on the sort column (plus the sequence
        column, when present, to break ties) and partitioned by the
        series partition columns (if any)
    """
    # figure out our sorting columns - the sequence column (if any) breaks ties
    primary_sort_col = self.ts_col if not sort_col else sort_col
    sort_cols = [primary_sort_col]
    if self.sequence_col:
        sort_cols.append(self.sequence_col)

    # are we ordering forwards (default) or reversed?
    # a conditional expression here avoids assigning a lambda (PEP 8 E731)
    order_exprs = [
        f.col(col).desc() if reverse else f.col(col) for col in sort_cols
    ]

    # our window will be sorted on our sort_cols in the appropriate direction
    w = Window().orderBy(order_exprs)
    # and partitioned by any series IDs
    if self.partitionCols:
        w = w.partitionBy([f.col(elem) for elem in self.partitionCols])
    return w
686
859
687
def __rangeBetweenWindow(self, range_from, range_to, sort_col=None, reverse=False):
    """
    Private helper: the base window restricted to the value range
    ``range_from`` through ``range_to`` around the current row.
    """
    base = self.__baseWindow(sort_col=sort_col, reverse=reverse)
    return base.rangeBetween(range_from, range_to)
689
864
690
def __rowsBetweenWindow(self, rows_from, rows_to, reverse=False):
    """
    Private helper: the base window restricted to the row offsets
    ``rows_from`` through ``rows_to`` around the current row.
    """
    base = self.__baseWindow(reverse=reverse)
    return base.rowsBetween(rows_from, rows_to)
692
867
693
868
def withPartitionCols (self , partitionCols ):
694
869
"""
0 commit comments