
Commit 891e4bd

Temporal slicing (#229)
* saving current state of the refactoring * Temporal slicing helper functions * Making Spark Session a class-level fixture Suppressing unhelpful warnings * Pandas baseline not ready yet. * Adding the JSON based unit test data config * Setting accessor methods and loading data files in setUp * Adding separate json files * updating the json files for the tests configs of interpolation and tsdf * updating the json files for the tests configs of utils * updated the get_data_as_sdf to use buildTestDF function * Documenting temporal slicing functions refactored utils env variables stubbed out test functions for temporal slicing * Using jsonref library to resolve references to shared data objects * reformatting all code with Black * resolving flake8 errors * a few more formatting issues * resolving black & flake8 formatting issues * Fixed interpolation tests. * Test case should not fail if we cannot load test data. Issue a warning and continue. * Got all tests working * converted all tests in tsdf_tests.py to new framework * converted tests in utils_tests.py to new framework * converted tests in delta_writer_tests.py to new framework * converted tests in as_of_join_tests.py to new framework * suppressing some warnings * applied black formatting * test code for new temporal slicing functions * import version code after path mangling... * Should not build public API docs on private / internal functions, classes, etc. * reformatting docstrings to properly generate Sphinx docs created utils function get_display_df to prepare a TSDF for display functions * black reformatting * Adding time slicing section to user guide * Slicing operations need to be able to handle numeric as well as string-based timestamps from the user. * black and flake8 formatting Co-authored-by: Souvik Pratiher <[email protected]>
1 parent 1b37bcb commit 891e4bd

File tree

8 files changed (+703, -32 lines)


docs/user-guide.rst (+43)

@@ -59,7 +59,50 @@ time column and the optional partition column specification.
     phone_accel_tsdf = TSDF(phone_accel_df, ts_col="event_ts", partition_cols = ["User"])
     display(phone_accel_tsdf)

+Slice by Time
+~~~~~~~~~~~~~~~~~~~~~~
+
+You can slice across all timeseries in a TSDF in various ways. This allows you to select or filter by timestamp across
+all the series.
+
+You can select all observations at a specific point in time:
+
+.. code-block:: python
+
+    target_time = '2015-02-23T13:03:53.919+0000'
+    at_target_tsdf = phone_accel_tsdf.at(target_time)
+    display(at_target_tsdf)
+
+You can slice data before or after a particular point in time (either inclusive or exclusive of the target time):
+
+.. code-block:: python
+
+    before_tsdf = phone_accel_tsdf.before(target_time)
+    at_or_after_tsdf = phone_accel_tsdf.atOrAfter(target_time)
+
+Or in an interval between two timestamps:
+
+.. code-block:: python
+
+    start_ts = '2015-02-23T13:03:53.909+0000'
+    end_ts = target_time
+    interval_inclusive = phone_accel_tsdf.between(start_ts, end_ts)
+    interval_exclusive = phone_accel_tsdf.between(start_ts, end_ts, inclusive=False)
+
+You can take a look at the earliest (oldest) or latest (most recent) records across all series:
+
+.. code-block:: python
+
+    n = 5
+    oldest_five_tsdf = phone_accel_tsdf.earliest(n)
+    latest_five_tsdf = phone_accel_tsdf.latest(n)
+
+Or the records immediately before (or after) a particular point in time. This can be thought of like an "as-of" select.
+
+.. code-block:: python
+
+    as_of_tsdf = phone_accel_tsdf.priorTo(target_time)
+    next_five_tsdf = phone_accel_tsdf.subsequentTo(target_time, n=5)

 Resample and Visualize
 ~~~~~~~~~~~~~~~~~~~~~~
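The inclusive/exclusive semantics of the new `between` slice can be sketched outside of Spark. The following pure-Python analogue (the record list and helper names are illustrative, not part of tempo) mirrors how `between(start, end, inclusive=True)` composes the one-sided `atOrAfter` and `atOrBefore` filters:

```python
# Pure-Python sketch of the TSDF time-slicing semantics above.
# The record list and helper names are illustrative, not tempo's API.
records = [
    {"user": "A", "event_ts": "2015-02-23T13:03:53.909+0000"},
    {"user": "A", "event_ts": "2015-02-23T13:03:53.919+0000"},
    {"user": "B", "event_ts": "2015-02-23T13:03:53.929+0000"},
]

def at_or_after(rows, ts):
    # analogue of TSDF.atOrAfter: keep records at or after ts
    return [r for r in rows if r["event_ts"] >= ts]

def at_or_before(rows, ts):
    # analogue of TSDF.atOrBefore: keep records at or before ts
    return [r for r in rows if r["event_ts"] <= ts]

def between(rows, start_ts, end_ts, inclusive=True):
    # inclusive slicing composes the two one-sided filters,
    # mirroring TSDF.between = atOrAfter(start) then atOrBefore(end)
    if inclusive:
        return at_or_before(at_or_after(rows, start_ts), end_ts)
    # exclusive slicing uses strict comparisons on both ends
    return [r for r in rows if start_ts < r["event_ts"] < end_ts]

window = between(
    records,
    "2015-02-23T13:03:53.909+0000",
    "2015-02-23T13:03:53.919+0000",
)
```

Because all the sample timestamps share the same offset format, plain string comparison orders them correctly here; tempo itself delegates the comparison to a Spark SQL expression.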

python/requirements.txt (+1)

@@ -19,3 +19,4 @@ Sphinx==4.5.0
 sphinx-design==0.2.0
 sphinx-panels==0.6.0
 jsonref==0.2
+python-dateutil==2.8.2

python/tempo/tsdf.py (+197, -22)

@@ -8,13 +8,18 @@
 from IPython.display import display as ipydisplay
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
-from pyspark.sql.window import Window
+from pyspark.sql.window import Window, WindowSpec
 from scipy.fft import fft, fftfreq

 import tempo.io as tio
 import tempo.resample as rs
 from tempo.interpol import Interpolation
-from tempo.utils import ENV_BOOLEAN, PLATFORM, calculate_time_horizon
+from tempo.utils import (
+    ENV_CAN_RENDER_HTML,
+    IS_DATABRICKS,
+    calculate_time_horizon,
+    get_display_df,
+)

 logger = logging.getLogger(__name__)

@@ -36,7 +41,7 @@ def __init__(self, df, ts_col="event_ts", partition_cols=None, sequence_col=None
         self.partitionCols = (
             []
             if partition_cols is None
-            else self.__validated_columns(df, partition_cols)
+            else self.__validated_columns(df, partition_cols.copy())
         )

         self.df = df

@@ -309,6 +314,10 @@ def __getTimePartitions(self, tsPartitionVal, fraction=0.1):
         )
         return TSDF(df, self.ts_col, self.partitionCols + ["ts_partition"])

+    #
+    # Slicing & Selection
+    #
+
     def select(self, *cols):
         """
         pyspark.sql.DataFrame.select() method's equivalent for TSDF objects

@@ -342,7 +351,164 @@ def select(self, *cols):
             "In TSDF's select statement original ts_col, partitionCols and seq_col_stub(optional) must be present"
         )

-    def show(self, n=20, truncate=True, vertical=False):
+    def __slice(self, op: str, target_ts):
+        """
+        Private method to slice TSDF by time
+
+        :param op: string symbol of the operation to perform
+        :type op: str
+        :param target_ts: timestamp on which to filter
+
+        :return: a TSDF object containing only those records within the time slice specified
+        """
+        # quote our timestamp if it's a string
+        target_expr = f"'{target_ts}'" if isinstance(target_ts, str) else target_ts
+        slice_expr = f.expr(f"{self.ts_col} {op} {target_expr}")
+        sliced_df = self.df.where(slice_expr)
+        return TSDF(
+            sliced_df,
+            ts_col=self.ts_col,
+            partition_cols=self.partitionCols,
+            sequence_col=self.sequence_col,
+        )
+
+    def at(self, ts):
+        """
+        Select only records at a given time
+
+        :param ts: timestamp of the records to select
+
+        :return: a :class:`~tsdf.TSDF` object containing just the records at the given time
+        """
+        return self.__slice("==", ts)
+
+    def before(self, ts):
+        """
+        Select only records before a given time
+
+        :param ts: timestamp on which to filter records
+
+        :return: a :class:`~tsdf.TSDF` object containing just the records before the given time
+        """
+        return self.__slice("<", ts)
+
+    def atOrBefore(self, ts):
+        """
+        Select only records at or before a given time
+
+        :param ts: timestamp on which to filter records
+
+        :return: a :class:`~tsdf.TSDF` object containing just the records at or before the given time
+        """
+        return self.__slice("<=", ts)
+
+    def after(self, ts):
+        """
+        Select only records after a given time
+
+        :param ts: timestamp on which to filter records
+
+        :return: a :class:`~tsdf.TSDF` object containing just the records after the given time
+        """
+        return self.__slice(">", ts)
+
+    def atOrAfter(self, ts):
+        """
+        Select only records at or after a given time
+
+        :param ts: timestamp on which to filter records
+
+        :return: a :class:`~tsdf.TSDF` object containing just the records at or after the given time
+        """
+        return self.__slice(">=", ts)
+
+    def between(self, start_ts, end_ts, inclusive=True):
+        """
+        Select only records in a given range
+
+        :param start_ts: starting time of the range to select
+        :param end_ts: ending time of the range to select
+        :param inclusive: whether the range is inclusive of the endpoints or not, defaults to True
+        :type inclusive: bool
+
+        :return: a :class:`~tsdf.TSDF` object containing just the records within the range specified
+        """
+        if inclusive:
+            return self.atOrAfter(start_ts).atOrBefore(end_ts)
+        return self.after(start_ts).before(end_ts)
+
+    def __top_rows_per_series(self, win: WindowSpec, n: int):
+        """
+        Private method to select just the top n rows per series (as defined by a window ordering)
+
+        :param win: the window on which we order the rows in each series
+        :param n: the number of rows to return
+
+        :return: a :class:`~tsdf.TSDF` object containing just the top n rows in each series
+        """
+        row_num_col = "__row_num"
+        prev_records_df = (
+            self.df.withColumn(row_num_col, f.row_number().over(win))
+            .where(f.col(row_num_col) <= f.lit(n))
+            .drop(row_num_col)
+        )
+        return TSDF(
+            prev_records_df,
+            ts_col=self.ts_col,
+            partition_cols=self.partitionCols,
+            sequence_col=self.sequence_col,
+        )
+
+    def earliest(self, n: int = 1):
+        """
+        Select the earliest n records for each series
+
+        :param n: number of records to select (default is 1)
+
+        :return: a :class:`~tsdf.TSDF` object containing the earliest n records for each series
+        """
+        prev_window = self.__baseWindow(reverse=False)
+        return self.__top_rows_per_series(prev_window, n)
+
+    def latest(self, n: int = 1):
+        """
+        Select the latest n records for each series
+
+        :param n: number of records to select (default is 1)
+
+        :return: a :class:`~tsdf.TSDF` object containing the latest n records for each series
+        """
+        next_window = self.__baseWindow(reverse=True)
+        return self.__top_rows_per_series(next_window, n)
+
+    def priorTo(self, ts, n: int = 1):
+        """
+        Select the n most recent records prior to a given time
+        You can think of this like an 'asOf' select - it selects the records as of a particular time
+
+        :param ts: timestamp on which to filter records
+        :param n: number of records to select (default is 1)
+
+        :return: a :class:`~tsdf.TSDF` object containing the n records prior to the given time
+        """
+        return self.atOrBefore(ts).latest(n)
+
+    def subsequentTo(self, ts, n: int = 1):
+        """
+        Select the n records subsequent to a given time
+
+        :param ts: timestamp on which to filter records
+        :param n: number of records to select (default is 1)
+
+        :return: a :class:`~tsdf.TSDF` object containing the n records subsequent to the given time
+        """
+        return self.atOrAfter(ts).earliest(n)
+
+    #
+    # Display functions
+    #
+
+    def show(self, n=20, k=5, truncate=True, vertical=False):
         """
         pyspark.sql.DataFrame.show() method's equivalent for TSDF objects

@@ -372,16 +538,14 @@ def show(self, n=20, truncate=True, vertical=False):
         phone_accel_tsdf.show()

         """
-        if PLATFORM == "DATABRICKS" or ENV_BOOLEAN is False:
-            self.df.show(n, truncate, vertical)
-        elif ENV_BOOLEAN:
+        # validate k <= n
+        if k > n:
+            raise ValueError(f"Parameter k {k} cannot be greater than parameter n {n}")
+
+        if not (IS_DATABRICKS) and ENV_CAN_RENDER_HTML:
             # In Jupyter notebooks, for wide dataframes the below line will enable rendering the output in a scrollable format.
             ipydisplay(HTML("<style>pre { white-space: pre !important; }</style>"))
-            self.df.show(n, truncate, vertical)
-        else:
-            self.df.show(
-                n, truncate=False
-            )  # default show method behaviour in case all conditions fail
+        get_display_df(self, k).show(n, truncate, vertical)

     def describe(self):
         """

@@ -672,23 +836,34 @@ def asofJoin(
         return asofDF

-    def __baseWindow(self, sort_col=None):
-        # add all sort keys - time is first, unique sequence number breaks the tie
+    def __baseWindow(self, sort_col=None, reverse=False):
+        # figure out our sorting columns
+        primary_sort_col = self.ts_col if not sort_col else sort_col
+        sort_cols = (
+            [primary_sort_col, self.sequence_col]
+            if self.sequence_col
+            else [primary_sort_col]
+        )

-        sort_col = self.ts_col if not sort_col else sort_col
-        ptntl_sort_keys = [sort_col, self.sequence_col]
-        sort_keys = [f.col(col_name) for col_name in ptntl_sort_keys if col_name != ""]
+        # are we ordering forwards (default) or reversed?
+        col_fn = f.col
+        if reverse:
+            col_fn = lambda colname: f.col(colname).desc()  # noqa E731

-        w = Window().orderBy(sort_keys)
+        # our window will be sorted on our sort_cols in the appropriate direction
+        w = Window().orderBy([col_fn(col) for col in sort_cols])
+        # and partitioned by any series IDs
         if self.partitionCols:
             w = w.partitionBy([f.col(elem) for elem in self.partitionCols])
         return w

-    def __rangeBetweenWindow(self, range_from, range_to, sort_col=None):
-        return self.__baseWindow(sort_col).rangeBetween(range_from, range_to)
+    def __rangeBetweenWindow(self, range_from, range_to, sort_col=None, reverse=False):
+        return self.__baseWindow(sort_col=sort_col, reverse=reverse).rangeBetween(
+            range_from, range_to
+        )

-    def __rowsBetweenWindow(self, rows_from, rows_to):
-        return self.__baseWindow().rowsBetween(rows_from, rows_to)
+    def __rowsBetweenWindow(self, rows_from, rows_to, reverse=False):
+        return self.__baseWindow(reverse=reverse).rowsBetween(rows_from, rows_to)

     def withPartitionCols(self, partitionCols):
         """

python/tempo/utils.py (+18, -8)

@@ -8,10 +8,11 @@
 from pandas import DataFrame as pandasDataFrame
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.functions import expr, max, min, sum, percentile_approx
+
 from tempo.resample import checkAllowableFreq, freq_dict

 logger = logging.getLogger(__name__)
-PLATFORM = "DATABRICKS" if "DB_HOME" in os.environ.keys() else "NON_DATABRICKS"
+IS_DATABRICKS = "DB_HOME" in os.environ.keys()

 """
 DB_HOME env variable has been chosen and that's because this variable is a special variable that will be available in DBR.

@@ -139,31 +140,40 @@ def display_unavailable(df):
     )


-ENV_BOOLEAN = __is_capable_of_html_rendering()
+def get_display_df(tsdf, k):
+    # let's show the k most recent records per series, in order:
+    orderCols = tsdf.partitionCols.copy()
+    orderCols.append(tsdf.ts_col)
+    if tsdf.sequence_col:
+        orderCols.append(tsdf.sequence_col)
+    return tsdf.latest(k).df.orderBy(orderCols)
+
+
+ENV_CAN_RENDER_HTML = __is_capable_of_html_rendering()


 if (
-    (PLATFORM == "DATABRICKS")
-    and not isinstance(get_ipython(), None)
-    and "display" in get_ipython().user_ns.keys()
+    IS_DATABRICKS
+    and not (get_ipython() is None)
+    and ("display" in get_ipython().user_ns.keys())
 ):
     method = get_ipython().user_ns["display"]
     # Under 'display' key in user_ns the original databricks display method is present
     # to know more refer: /databricks/python_shell/scripts/db_ipykernel_launcher.py

     def display_improvised(obj):
         if type(obj).__name__ == "TSDF":
-            method(obj.df)
+            method(get_display_df(obj, k=5))
         else:
             method(obj)

     display = display_improvised

-elif ENV_BOOLEAN:
+elif ENV_CAN_RENDER_HTML:

     def display_html_improvised(obj):
         if type(obj).__name__ == "TSDF":
-            display_html(obj.df)
+            display_html(get_display_df(obj, k=5))
         else:
             display_html(obj)
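`get_display_df` orders its output by the series identifiers first, then the timestamp, then the optional sequence column. A small sketch of just that key-construction step (a hypothetical standalone helper, not part of tempo) shows why the partition columns are copied before appending:

```python
# Sketch of how get_display_df builds its ordering columns:
# partition cols first, then ts_col, then the optional sequence col.
# Standalone illustrative helper, not tempo's actual code.
def build_order_cols(partition_cols, ts_col, sequence_col=None):
    order_cols = list(partition_cols)  # copy, as in tsdf.partitionCols.copy()
    order_cols.append(ts_col)
    if sequence_col:
        order_cols.append(sequence_col)
    return order_cols
```

Copying first means the caller's partition-column list is never mutated, the same concern the `partition_cols.copy()` fix in `TSDF.__init__` addresses.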
