Enable 2011 transect interval filter file ingestion

brandynlucca · brandynlucca · commit 7d7461a597f0 · 2025-04-22T22:42:12.000-07:00
diff --git a/config_files/survey_year_2011_config.yml b/config_files/survey_year_2011_config.yml
@@ -67,6 +67,10 @@ NASC:
     # file that includes all ages
     filename: Exports/US_CAN_NASC_2011_table_all_ages.xlsx
     sheetname: Sheet1
+transect_filter:
+  # Transect interval filtering
+  filename: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/echopop_2011/Kriging_files/Kriging_grid_files/Transect Bounds to 2011.xlsx
+  sheetname: "1995-2011"
 export_regions:
   filename: Stratification/US&CAN_T_reg_haul_final.csv
 kriging:
diff --git a/echopop/compatibility_parameters_test.json b/echopop/compatibility_parameters_test.json
@@ -157,6 +157,9 @@
       "ks"
     ],
     "write_transect_region_file": false,
-    "transect_replicates": 10
+    "transect_replicates": 10,
+    "transect_filter": {
+      "subset_filter": "survey == 201103"
+    }
   }  
 }
diff --git a/echopop/spatial/transect.py b/echopop/spatial/transect.py
@@ -1,4 +1,5 @@
-from typing import List, Union
+from typing import List, Optional, TYPE_CHECKING, Union
+import re
 
 import geopandas as gpd
 import geopy.distance
@@ -7,6 +8,8 @@
 from shapely.geometry import Point, Polygon
 from shapely.ops import unary_union
 
+if TYPE_CHECKING: from ..survey import Survey
+
 from ..spatial.projection import wgs84_to_utm
 
 
@@ -131,6 +134,64 @@ def edit_transect_columns(transect_dict: dict, settings_dict: dict):
     # Return the output
     return transect_info.reset_index()
 
+def filter_transect_intervals(
+    survey_obj: "Survey", 
+    subset_filter: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Filter transect intervals based on a subset filter.
+    """
+
+    # Get configuration settings
+    transect_filter_settings = survey_obj.config["transect_filter"]
+
+    # Read in file
+    transect_filter_df = pd.read_excel(transect_filter_settings["filename"], 
+                                       sheet_name=transect_filter_settings["sheetname"])
+
+    # Lowercase column names
+    transect_filter_df.columns = transect_filter_df.columns.str.lower()
+    # ---- Apply hard-coded renaming
+    transect_filter_df.rename(columns={"transect": "transect_num"}, inplace=True)
+
+    # Get the validated input data
+    df = survey_obj.input["acoustics"]["nasc_df"].copy()
+    # ---- Retain original column
+    df_columns = df.columns.tolist()
+
+    # Apply a filter, if needed
+    if subset_filter is not None:
+        # ---- Extract tokens from string
+        tokens = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', subset_filter)
+        # ---- Provide typical Python operator keywords
+        keywords = {"and", "or", "not", "in", "notin", "True", "False"}
+        # ---- Check for column names
+        column_names = [t for t in tokens if t not in keywords and not t.isnumeric()]
+        # ---- Check if all referenced columns exist
+        missing = [col for col in column_names if col not in transect_filter_df.columns]     
+        # ---- Raise error, if needed
+        if missing:
+            raise ValueError(f"Invalid column(s): {', '.join(missing)}")
+        # ---- Apply filter
+        else:
+            transect_filter_df = transect_filter_df.copy().query(subset_filter)
+
+    # Perform a cross join to pair each row in `df` with every row in `fi`
+    df_expanded = df.merge(transect_filter_df[["transect_num", "log_start", "log_end"]], 
+                           on="transect_num", 
+                           how="left")
+
+    # Check for overlap
+    mask = (
+        (df_expanded["distance_e"] >= df_expanded["log_start"]) & 
+        (df_expanded["distance_s"] <= df_expanded["log_end"])
+    )
+
+    # Apply mask to original dataset
+    survey_obj.input["acoustics"]["nasc_df"] = (
+        df_expanded[mask].filter(df_columns).reset_index(drop=True)
+    )
+
 
 def transect_spatial_features(transect_data: pd.DataFrame):
     """
diff --git a/echopop/test_survey.py b/echopop/test_survey.py
@@ -17,6 +17,7 @@
 import glob
 import json
 import os
+import sys
 
 ####################################################################################################
 # CURRENT SURVEY YEAR BEING TESTED: 2019
@@ -54,6 +55,13 @@
     survey.load_acoustic_data()
 else:
     Survey(init_config_path, survey_year_config_path).load_acoustic_data()
+# ---- Test whether transect data must be subset
+if "transect_filter" in parameters:
+    # ---- Conditional import
+    if "filter_transect_intervals" not in dir():
+        from echopop.spatial.transect import filter_transect_intervals
+    # ---- Filter transect intervals
+    filter_transect_intervals(survey, parameters["transect_filter"]["subset_filter"])
 # ---- Load survey data
 survey.load_survey_data()
 # ---- Initial transect analysis test
diff --git a/echopop/utils/validate_dict.py b/echopop/utils/validate_dict.py
@@ -565,6 +565,7 @@ class CONFIG_DATA_MODEL(InputModel):
     data_root_dir: Optional[str] = None
     CAN_haul_offset: Optional[int] = None
     ship_id: Optional[Union[int, str, float]] = None
+    transect_filter: Optional[XLSXFile] = None
     export_regions: Optional[Union[Union[CSVFile, XLSXFile], 
                                    Dict[str, Union[CSVFile, XLSXFile]]]] = None
 

Original file line number	Diff line number	Diff line change
`@@ -157,6 +157,9 @@`
`157`	`157`	`"ks"`
`158`	`158`	`],`
`159`	`159`	`"write_transect_region_file": false,`
`160`		`- "transect_replicates": 10`
	`160`	`+ "transect_replicates": 10,`
	`161`	`+ "transect_filter": {`
	`162`	`+ "subset_filter": "survey == 201103"`
	`163`	`+ }`
`161`	`164`	`}`
`162`	`165`	`}`