Skip to content

Commit 15a1bcd

Browse files
ctuguinayoftfrfbf
authored and committed
Drop Ping Time Duplicates (OSOceanAcoustics#1382)
* init commit * revert change to fix merge conflict * test only one file * use other file * move test duplicate to test convert ek * add extra line * move test back to ek80 convert * pin zarr and add check unique ping time duplicates and tests * fix test message * test remove zarr pin * add back zarr pin
1 parent 9f8a406 commit 15a1bcd

File tree

3 files changed

+115
-0
lines changed

3 files changed

+115
-0
lines changed

echopype/convert/set_groups_ek80.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from ..utils.coding import set_time_encodings
99
from ..utils.log import _init_logger
1010
from .set_groups_base import SetGroupsBase
11+
from .utils.ek_duplicates import check_unique_ping_time_duplicates
1112

1213
logger = _init_logger(__name__)
1314

@@ -1145,6 +1146,17 @@ def set_beam(self) -> List[xr.Dataset]:
11451146

11461147
ds_data = self._attach_vars_to_ds_data(ds_data, ch, rs_size=ds_data.range_sample.size)
11471148

1149+
# Access the 'ping_time' coordinate as a NumPy array
1150+
ping_times = ds_data["ping_time"].values
1151+
1152+
# Check if ping time duplicates exist
1153+
if len(ping_times) > len(np.unique(ping_times)):
1154+
# Check for unique ping time duplicates and if they are not unique, raise warning.
1155+
check_unique_ping_time_duplicates(ds_data, logger)
1156+
1157+
# Drop duplicates
1158+
ds_data = ds_data.drop_duplicates(dim="ping_time")
1159+
11481160
if ch in self.sorted_channel["complex"]:
11491161
ds_complex.append(ds_data)
11501162
else:
echopype/convert/utils/ek_duplicates.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import logging
2+
3+
import xarray as xr
4+
5+
6+
def check_unique_ping_time_duplicates(ds_data: xr.Dataset, logger: logging.Logger) -> None:
    """
    Log a warning when pings sharing the same ``ping_time`` hold differing data.

    For every ``ping_time`` value that occurs more than once, each data variable
    is compared across the duplicate pings, using the first duplicate as the
    reference. One warning is logged per (variable, ``ping_time``) pair whose
    duplicates are not all equal. Nothing is raised and the dataset is not
    modified; dropping the duplicates is left to the caller.

    Parameters
    ----------
    ds_data : xr.Dataset
        Single freq beam dataset being processed in the `SetGroupsEK80.set_beam`
        class function.
    logger : logging.Logger
        Logger on which the warning is emitted (initialized in the
        `SetGroupsEK80` file).
    """
    # Group the dataset by the "ping_time" coordinate
    for ping_time_val, group in ds_data.groupby("ping_time"):
        # A ping time that occurs only once has nothing to be compared against,
        # so skip it before doing any per-variable slicing.
        if group.sizes.get("ping_time", 1) < 2:
            continue

        # Iterate over each data variable in the group
        for var in group.data_vars:
            data_array = group[var]

            # A variable that does not vary along "ping_time" cannot differ
            # between duplicate pings (and cannot be indexed along it).
            if "ping_time" not in data_array.dims:
                continue

            # Use the first duplicate ping as the reference slice
            ref_slice = data_array.isel({"ping_time": 0})

            # Compare the remaining duplicates against the reference; `any`
            # stops at the first mismatch so each variable warns at most once.
            has_mismatch = any(
                not ref_slice.equals(data_array.isel({"ping_time": i}))
                for i in range(1, data_array.sizes["ping_time"])
            )
            if has_mismatch:
                logger.warning(
                    f"Duplicate slices in variable '{var}' corresponding to 'ping_time' "
                    f"{ping_time_val} differ in data. All duplicate 'ping_time' entries "
                    "will be removed, which will result in data loss."
                )

echopype/tests/convert/test_convert_ek80.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,14 @@
44
import numpy as np
55
import pandas as pd
66
from scipy.io import loadmat
7+
import xarray as xr
78

89
from echopype import open_raw, open_converted
910
from echopype.testing import TEST_DATA_FOLDER
1011
from echopype.convert.parse_ek80 import ParseEK80
1112
from echopype.convert.set_groups_ek80 import WIDE_BAND_TRANS, PULSE_COMPRESS, FILTER_IMAG, FILTER_REAL, DECIMATION
13+
from echopype.utils import log
14+
from echopype.convert.utils.ek_duplicates import check_unique_ping_time_duplicates
1215

1316

1417
@pytest.fixture
@@ -512,6 +515,62 @@ def test_parse_missing_sound_velocity_profile():
512515
shutil.rmtree(save_path)
513516

514517

518+
@pytest.mark.unit
def test_duplicate_ping_times(caplog):
    """
    Tests that a RAW file with duplicate ping times can be parsed, that the
    duplicates are dropped from the Beam group, and that no data-loss warning
    is logged for it.
    """
    # Turn on logger verbosity
    log.verbose(override=False)

    try:
        # Open RAW
        ed = open_raw(
            "echopype/test_data/ek80_duplicate_ping_times/Hake-D20210913-T130612.raw",
            sonar_model="EK80",
        )

        # Check that there are no ping time duplicates in Beam group
        assert ed["Sonar/Beam_group1"].equals(
            ed["Sonar/Beam_group1"].drop_duplicates(dim="ping_time")
        )

        # Check that no data-loss warning is logged, i.e. the duplicate pings in
        # this file carry identical data. The text must be a substring of the
        # message emitted by `check_unique_ping_time_duplicates`, otherwise this
        # assertion could never fail.
        not_expected_warning = (
            "All duplicate 'ping_time' entries will be removed, "
            "which will result in data loss."
        )
        assert not any(not_expected_warning in record.message for record in caplog.records)
    finally:
        # Turn off logger verbosity, even if an assertion above failed
        log.verbose(override=True)
540+
541+
542+
@pytest.mark.unit
def test_check_unique_ping_time_duplicates(caplog):
    """
    Verifies that `check_unique_ping_time_duplicates` logs a warning when
    duplicate ping times do not hold the same data.
    """
    # Logger handed to the function under test
    test_logger = log._init_logger(__name__)

    # Turn on logger verbosity
    log.verbose(override=False)

    # Load the stored beam dataset that contains duplicate ping times
    beam_ds = xr.open_zarr("echopype/test_data/ek80_duplicate_ping_times/duplicate_beam_ds.zarr")

    # Overwrite one backscatter sample so that at least one set of duplicate
    # ping times no longer shares the same backscatter data
    beam_ds["backscatter_r"][0, 0, 0] = 0

    # Run the duplicate check
    check_unique_ping_time_duplicates(beam_ds, test_logger)

    # Turn off logger verbosity
    log.verbose(override=True)

    # Build the warning expected for the first ping time and look for it in the log
    first_ping_time = str(beam_ds["ping_time"].values[0])
    expected_warning = (
        "Duplicate slices in variable 'backscatter_r' corresponding to 'ping_time' "
        f"{first_ping_time} differ in data. All duplicate "
        "'ping_time' entries will be removed, which will result in data loss."
    )
    matching_records = [rec for rec in caplog.records if expected_warning in rec.message]
    assert matching_records
572+
573+
515574
@pytest.mark.unit
516575
def test_parse_ek80_with_invalid_env_datagrams():
517576
"""

0 commit comments

Comments
 (0)