From 921a4d9b86071fcbe566493e2f2c47773fe68775 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 10 Oct 2022 17:07:31 -0700 Subject: [PATCH 01/28] establish core function for binning and computing the mean of a 2D array --- echopype/preprocess/api.py | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 5b6e4be02..a0b05ec2c 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import xarray as xr +import dask.array from ..utils.prov import echopype_prov_attrs from .noise_est import NoiseEst @@ -329,3 +330,65 @@ def remove_noise(ds_Sv, ping_num, range_sample_num, noise_max=None, SNR_threshol def regrid(): return 1 + + +def scale_array(array_vals, new_min, new_max): + old_max = array_vals.max() + old_min = array_vals.min() + + # TODO: error out for old_max == old_min + + old_range = (old_max - old_min) + new_range = (new_max - new_min) + new_array = (((array_vals - old_min) * new_range) / old_range) + new_min + + return new_array + + +def create_known_mean_data(): + + num_pings_in_bin = [4, 5, 6] + num_er_in_bin = [10, 20, 11] + + er_bin_ranges = [(0, 5), (5.1, 10), (10.1, 20)] + + arrays = [] + means = [] + for count, er_bin in enumerate(num_er_in_bin): + + temp = [] + for ping_bin in num_pings_in_bin: + a = np.random.random_sample((er_bin, ping_bin)) + + # scale a + a = scale_array(a, er_bin_ranges[count][0], er_bin_ranges[count][1]) + + temp.append(a) + + means.append([arr.mean() for arr in temp]) + arrays.append(np.concatenate(temp, axis=1)) + + full_array = np.concatenate(arrays, axis=0) + + +def bin_and_mean(arr, bins_ping, bins_er, pings): + + if isinstance(pings, dask.array): + bin_ping_ind = np.digitize(pings.compute(), bins_ping, right=False) + else: + bin_ping_ind = np.digitize(pings, bins_ping, right=False) + + # possible bin value, note: 0 means less than first bin val, + # and 
len(bins) + 1 is greater than the last bin value + n_bin_ping = len(bins_ping) + 1 + n_bin_er = len(bins_er) + 1 + + binned_means = [] + for bin_ping in range(n_bin_ping): + + da_binned_ping = arr[:, bin_ping_ind == bin_ping] + + bin_er_ind = np.digitize(da_binned_ping, bins_er, right=False) + + for bin_er in range(n_bin_er): + binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) \ No newline at end of file From 12c0d69f42d4f328c59c47d51921eaa4c7023573 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 11 Oct 2022 10:22:16 -0700 Subject: [PATCH 02/28] finish structure of bin_and_mean_2d and create simple tests for it --- echopype/preprocess/api.py | 71 +++++++----------- echopype/tests/preprocess/test_preprocess.py | 76 ++++++++++++++++++++ 2 files changed, 103 insertions(+), 44 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index a0b05ec2c..9234e6ff5 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,10 +2,10 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" +import dask.array import numpy as np import pandas as pd import xarray as xr -import dask.array from ..utils.prov import echopype_prov_attrs from .noise_est import NoiseEst @@ -332,63 +332,46 @@ def regrid(): return 1 -def scale_array(array_vals, new_min, new_max): - old_max = array_vals.max() - old_min = array_vals.min() - - # TODO: error out for old_max == old_min - - old_range = (old_max - old_min) - new_range = (new_max - new_min) - new_array = (((array_vals - old_min) * new_range) / old_range) + new_min - - return new_array - - -def create_known_mean_data(): - - num_pings_in_bin = [4, 5, 6] - num_er_in_bin = [10, 20, 11] - - er_bin_ranges = [(0, 5), (5.1, 10), (10.1, 20)] - - arrays = [] - means = [] - for count, er_bin in enumerate(num_er_in_bin): - - temp = [] - for ping_bin in num_pings_in_bin: - a = np.random.random_sample((er_bin, ping_bin)) - - # scale a - a = scale_array(a, er_bin_ranges[count][0], er_bin_ranges[count][1]) +def bin_and_mean_2d(arr, bins_ping, bins_er, pings): + """ + Bins and means a 2D array - temp.append(a) - means.append([arr.mean() for arr in temp]) - arrays.append(np.concatenate(temp, axis=1)) + bins_ping, bins_er are bins in np.digitize format - full_array = np.concatenate(arrays, axis=0) + TODO: document! + Notes + ----- + This function assumes that values in ``arr`` are not outside + of the bins presecribed by ``bins_er``. Additionally, it also + assumes that values in ``pings`` are not outside the bins given + by bins_ping. 
+ """ -def bin_and_mean(arr, bins_ping, bins_er, pings): + # TODO: put in check to make sure arr is 2D - if isinstance(pings, dask.array): + if isinstance(pings, dask.array.Array): bin_ping_ind = np.digitize(pings.compute(), bins_ping, right=False) else: bin_ping_ind = np.digitize(pings, bins_ping, right=False) - # possible bin value, note: 0 means less than first bin val, - # and len(bins) + 1 is greater than the last bin value - n_bin_ping = len(bins_ping) + 1 - n_bin_er = len(bins_er) + 1 + n_bin_ping = len(bins_ping) + n_bin_er = len(bins_er) binned_means = [] - for bin_ping in range(n_bin_ping): + for bin_ping in range(1, n_bin_ping): da_binned_ping = arr[:, bin_ping_ind == bin_ping] bin_er_ind = np.digitize(da_binned_ping, bins_er, right=False) - for bin_er in range(n_bin_er): - binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) \ No newline at end of file + for bin_er in range(1, n_bin_er): + binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) + + if isinstance(arr, dask.array.Array): + means = dask.array.from_array(binned_means) + else: + means = np.array(binned_means) + + return means.reshape((n_bin_ping - 1, n_bin_er - 1)) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 58be7cff0..e2876a6d7 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -3,6 +3,9 @@ import xarray as xr import echopype as ep import pytest +import dask.array + +from echopype.preprocess.api import bin_and_mean_2d @pytest.fixture( @@ -507,3 +510,76 @@ def test_preprocess_mvbs(test_data_samples): range_kwargs.pop('azfp_cal_type') Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) assert ep.preprocess.compute_MVBS(Sv) is not None + + +def create_bins(csum_array): + # TODO: document! 
+ bins = [] + for count, csum in enumerate(csum_array): + + if count == 0: + bins.append((0, csum)) + else: + # add 0.01 so that left bins don't overlap + bins.append((csum_array[count-1] + 0.01, csum)) + return bins + + +def create_known_mean_data(create_dask, num_pings_in_bin, num_er_in_bin): + # TODO: document! + + ping_csum = np.cumsum(num_pings_in_bin) + ping_bins = create_bins(ping_csum) + + er_csum = np.cumsum(num_er_in_bin) + er_bins = create_bins(er_csum) + + arrays = [] + means = [] + for ping_bin in num_pings_in_bin: + + temp = [] + for count, bin_val in enumerate(er_bins): + + if create_dask: + a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + else: + a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + + temp.append(a) + + means.append([arr.mean() for arr in temp]) + arrays.append(np.concatenate(temp, axis=0)) + + if create_dask: + means = dask.array.from_array(means).compute() + + full_array = np.concatenate(arrays, axis=1) + + return means, full_array, ping_bins, er_bins + + +def test_bin_and_mean_2d(): + # TODO: document and create a fixture with input of create_dask + + create_dask = True + + num_pings_in_bin = np.random.randint(low=10, high=1000, size=10) + num_er_in_bin = np.random.randint(low=10, high=1000, size=10) + + means, full_array, ping_bins, er_bins = create_known_mean_data(create_dask, + num_pings_in_bin, num_er_in_bin) + + ping_times = np.concatenate([np.random.uniform(bin_val[0], bin_val[1], num_pings_in_bin[count]) + for count, bin_val in enumerate(ping_bins)]) + + digitize_ping_bin = [*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:]] + digitize_er_bin = [*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]] + + calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times) + + if create_dask: + calc_means = calc_means.compute() + + assert np.allclose(calc_means, means, atol=1e-10, rtol=1e-10) + From 
0141f8ec1cd752f4423aa9ea3aed4ac8bb206157 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 17 Oct 2022 17:06:09 -0700 Subject: [PATCH 03/28] modify bin_and_mean_2d so that it produces the appropriate values and add get_MVBS_along_channels, which computes MVBS for each channel --- echopype/preprocess/api.py | 85 +++++++++++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 11 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 9234e6ff5..ac062a8cb 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -332,7 +332,7 @@ def regrid(): return 1 -def bin_and_mean_2d(arr, bins_ping, bins_er, pings): +def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): """ Bins and means a 2D array @@ -351,27 +351,90 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings): # TODO: put in check to make sure arr is 2D + # turn datetime into integers, so we can use np.digitize if isinstance(pings, dask.array.Array): - bin_ping_ind = np.digitize(pings.compute(), bins_ping, right=False) + pings_i8 = pings.compute().data.view("i8") else: - bin_ping_ind = np.digitize(pings, bins_ping, right=False) + pings_i8 = pings.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_ping_i8 = bins_ping.view("i8") + + # get bin index for each ping + bin_ping_ind = np.digitize(pings_i8, bins_ping_i8, right=False) + + # get bin index for each echo_range value + if isinstance(echo_range, dask.array.Array): + bin_er_ind = np.digitize(echo_range.compute(), bins_er, right=False) + else: + bin_er_ind = np.digitize(echo_range, bins_er, right=False) n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) binned_means = [] - for bin_ping in range(1, n_bin_ping): - - da_binned_ping = arr[:, bin_ping_ind == bin_ping] + for bin_ping in range(1, n_bin_ping + 1): - bin_er_ind = np.digitize(da_binned_ping, bins_er, right=False) + # ping_selected_data = arr[bin_ping_ind == bin_ping, :] + ping_selected_data = 
np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) + data_rows = [] + # TODO: can we do a mapping here, instead of a for loop? for bin_er in range(1, n_bin_er): - binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) + # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) + data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) + + # TODO:: can we use something besides stack that is more efficient? + binned_means.append(np.stack(data_rows, axis=0)) if isinstance(arr, dask.array.Array): - means = dask.array.from_array(binned_means) + return 10 * np.log10(np.stack(binned_means, axis=0).rechunk("auto")) else: - means = np.array(binned_means) + return 10 * np.log10(np.stack(binned_means, axis=0)) + + +def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): + + # range_meter_bin = 5 + # range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) + # ping_interval = np.array(list(ds_Sv.ping_time. + # resample(ping_time='20s', skipna=True).groups.keys())) + + all_MVBS = [] + for chan in ds_Sv.channel: + + # squeeze to remove "channel" dim if present + # TODO: not sure why not already removed for the AZFP case. Investigate. 
+ ds = ds_Sv.sel(channel=chan).squeeze() + + # average should be done in linear domain + sv = 10 ** (ds["Sv"] / 10) + + # set 1D coordinate using the 1st ping echo_range since identical for all pings + # remove padded NaN entries if exist for all pings + er = ( + ds["echo_range"] + .dropna(dim="range_sample", how="all") + .dropna(dim="ping_time") + .isel(ping_time=0) + ) + + # use first ping er as indexer for sv + sv = sv.sel(range_sample=er.range_sample.values) + sv.coords["echo_range"] = (["range_sample"], er.values) + sv = sv.swap_dims({"range_sample": "echo_range"}) + + # collect the MVBS calculated for the channel + all_MVBS.append( + bin_and_mean_2d( + sv.data, + bins_ping=ping_interval, + bins_er=range_interval, + pings=sv.ping_time.data, + echo_range=sv.echo_range.data, + ) + ) + + MVBS_values = np.stack(all_MVBS, axis=0) - return means.reshape((n_bin_ping - 1, n_bin_er - 1)) + return MVBS_values From 7e230e3271d4bb706bed00e7cc1dfe52a164aca2 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 18 Oct 2022 12:08:27 -0700 Subject: [PATCH 04/28] investigate more efficient dask graph --- echopype/preprocess/api.py | 67 +++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index ac062a8cb..838baa30a 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -373,24 +373,48 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): n_bin_er = len(bins_er) binned_means = [] - for bin_ping in range(1, n_bin_ping + 1): - - # ping_selected_data = arr[bin_ping_ind == bin_ping, :] - ping_selected_data = np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) - - data_rows = [] - # TODO: can we do a mapping here, instead of a for loop? 
- for bin_er in range(1, n_bin_er): - # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) - data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) - - # TODO:: can we use something besides stack that is more efficient? - binned_means.append(np.stack(data_rows, axis=0)) - - if isinstance(arr, dask.array.Array): - return 10 * np.log10(np.stack(binned_means, axis=0).rechunk("auto")) - else: - return 10 * np.log10(np.stack(binned_means, axis=0)) + for bin_er in range(1, n_bin_er): + # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + + # er_selected_data = arr[:, bin_er_ind == bin_er] + er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # + # binned_means.append(er_selected_data) + + data_rows = [] # TODO: rename? + for bin_ping in range(1, n_bin_ping + 1): + # data_rows.append(np.nanmean(er_selected_data[bin_ping_ind == bin_ping])) + # data_rows.append(er_selected_data[bin_ping_ind == bin_ping, :]) + data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) + # + # binned_means.append(data_rows) + temp = np.concatenate(data_rows, axis=0) + temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk('auto') #[None, None], chunks=(1, 1)) + binned_means.append(temp) + + stacked_means = np.transpose(np.vstack(binned_means)).rechunk('auto') + # return binned_means + return 10 * np.log10(stacked_means) + + # binned_means = [] + # for bin_ping in range(1, n_bin_ping + 1): + # + # # ping_selected_data = arr[bin_ping_ind == bin_ping, :] + # ping_selected_data = np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) + # + # data_rows = [] + # # TODO: can we do a mapping here, instead of a for loop? + # for bin_er in range(1, n_bin_er): + # # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) + # data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) + # + # # TODO:: can we use something besides stack that is more efficient? 
+ # binned_means.append(np.stack(data_rows, axis=0)) + + # if isinstance(arr, dask.array.Array): + # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) #.rechunk("auto"))) + # else: + # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): @@ -401,7 +425,7 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # resample(ping_time='20s', skipna=True).groups.keys())) all_MVBS = [] - for chan in ds_Sv.channel: + for chan in ds_Sv.channel[:1]: # squeeze to remove "channel" dim if present # TODO: not sure why not already removed for the AZFP case. Investigate. @@ -435,6 +459,5 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): ) ) - MVBS_values = np.stack(all_MVBS, axis=0) - - return MVBS_values + # return all_MVBS + return np.stack(all_MVBS, axis=0) From 43eb06294c11b3ff499330ab78301b46c1337120 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 19 Oct 2022 10:09:23 -0700 Subject: [PATCH 05/28] investigate chunking of MVBS computation --- echopype/preprocess/api.py | 50 +++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 838baa30a..3cebcbe04 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -372,28 +372,51 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # er_selected_data = arr[:, bin_er_ind == bin_er] + # + # temp = [] + # for bin_ping in range(1, n_bin_ping + 1): + # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + # temp.append(er_selected_data[indices, :]) + # + # binned_means.append(temp) + # + # return binned_means + + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # 
er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # # er_selected_data = arr[:, bin_er_ind == bin_er] + # + # binned_means.append(er_selected_data) + # + # temp = np.vstack(binned_means) + # + # temp2 = [] + # for bin_ping in range(1, n_bin_ping + 1): + # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + # temp2.append(10 * np.log10(np.nanmean(temp[:, indices], axis=1))) + + # return np.vstack(temp2) + + # return binned_means + binned_means = [] for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - - # er_selected_data = arr[:, bin_er_ind == bin_er] er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # - # binned_means.append(er_selected_data) data_rows = [] # TODO: rename? for bin_ping in range(1, n_bin_ping + 1): - # data_rows.append(np.nanmean(er_selected_data[bin_ping_ind == bin_ping])) - # data_rows.append(er_selected_data[bin_ping_ind == bin_ping, :]) data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) - # - # binned_means.append(data_rows) + temp = np.concatenate(data_rows, axis=0) - temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk('auto') #[None, None], chunks=(1, 1)) + temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") binned_means.append(temp) - stacked_means = np.transpose(np.vstack(binned_means)).rechunk('auto') - # return binned_means + stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") return 10 * np.log10(stacked_means) # binned_means = [] @@ -425,7 +448,7 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # resample(ping_time='20s', skipna=True).groups.keys())) all_MVBS = [] - for chan in ds_Sv.channel[:1]: + for chan in ds_Sv.channel: # squeeze to remove "channel" dim if present # TODO: not sure why not already removed for the AZFP case. Investigate. 
@@ -459,5 +482,4 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): ) ) - # return all_MVBS return np.stack(all_MVBS, axis=0) From fd91e3abb41a90c18542ef01cc8b6431e5f630f8 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 19 Oct 2022 13:26:25 -0700 Subject: [PATCH 06/28] add new approach that computes mean binned echorange values and then bins and means ping_time --- echopype/preprocess/api.py | 60 ++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 3cebcbe04..67a2451f7 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -372,19 +372,35 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) + binned_means = [] + for bin_er in range(1, n_bin_er): + er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + + binned_means.append(er_selected_data) + + er_means = np.vstack(binned_means).compute() + + final = np.empty((n_bin_ping, n_bin_er - 1)) + for bin_ping in range(1, n_bin_ping + 1): + indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + final[bin_ping - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) + + return final + # # binned_means = [] # for bin_er in range(1, n_bin_er): - # # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # er_selected_data = arr[:, bin_er_ind == bin_er] + # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) # - # temp = [] - # for bin_ping in range(1, n_bin_ping + 1): - # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - # temp.append(er_selected_data[indices, :]) + # binned_means.append(er_selected_data) # - # binned_means.append(temp) + # er_means = np.vstack(binned_means) # - # return binned_means + # final_list = [] + # for bin_ping in range(1, n_bin_ping + 1): + # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + # 
final_list.append(10 * np.log10(np.nanmean(er_means[:, indices], axis=1))) + # + # return np.vstack(final_list) # binned_means = [] # for bin_er in range(1, n_bin_er): @@ -404,20 +420,20 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): # return binned_means - binned_means = [] - for bin_er in range(1, n_bin_er): - er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - - data_rows = [] # TODO: rename? - for bin_ping in range(1, n_bin_ping + 1): - data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) - - temp = np.concatenate(data_rows, axis=0) - temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") - binned_means.append(temp) - - stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") - return 10 * np.log10(stacked_means) + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # + # data_rows = [] # TODO: rename? + # for bin_ping in range(1, n_bin_ping + 1): + # data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) + # + # temp = np.concatenate(data_rows, axis=0) + # temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") + # binned_means.append(temp) + # + # stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") + # return 10 * np.log10(stacked_means) # binned_means = [] # for bin_ping in range(1, n_bin_ping + 1): From 9b85afbd1abaa46d3810c50775576c954ec4caca Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 20 Oct 2022 09:55:32 -0700 Subject: [PATCH 07/28] remove commented out lines and add a function for bin and meaning the times --- echopype/preprocess/api.py | 134 +++++++++++++------------------------ 1 file changed, 46 insertions(+), 88 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 67a2451f7..b594ae5bb 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -332,7 +332,7 @@ def regrid(): return 1 -def 
bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): +def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): """ Bins and means a 2D array @@ -351,109 +351,67 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): # TODO: put in check to make sure arr is 2D - # turn datetime into integers, so we can use np.digitize - if isinstance(pings, dask.array.Array): - pings_i8 = pings.compute().data.view("i8") - else: - pings_i8 = pings.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_ping_i8 = bins_ping.view("i8") - - # get bin index for each ping - bin_ping_ind = np.digitize(pings_i8, bins_ping_i8, right=False) - # get bin index for each echo_range value if isinstance(echo_range, dask.array.Array): bin_er_ind = np.digitize(echo_range.compute(), bins_er, right=False) else: bin_er_ind = np.digitize(echo_range, bins_er, right=False) - n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) binned_means = [] for bin_er in range(1, n_bin_er): + er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) binned_means.append(er_selected_data) er_means = np.vstack(binned_means).compute() - final = np.empty((n_bin_ping, n_bin_er - 1)) - for bin_ping in range(1, n_bin_ping + 1): - indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - final[bin_ping - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) + # turn datetime into integers, so we can use np.digitize + if isinstance(times, dask.array.Array): + times_i8 = times.compute().data.view("i8") + else: + times_i8 = times.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_time_i8 = bins_time.view("i8") + + final = bin_and_mean_times(er_means, bins_time_i8, times_i8) + + return final + + +def bin_and_mean_times(er_means, bins_time_i8, times_i8): + """ + + + Parameters + ---------- + er_means + bins_time + times + + Returns + ------- + + Notes + ----- + This function assumes that ``times`` corresponds to the + columns of 
``er_means``. + """ + + # get bin index for each time + bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) + + n_bin_time = len(bins_time_i8) + + final = np.empty((n_bin_time, er_means.shape[0])) + for bin_time in range(1, n_bin_time + 1): + indices = np.argwhere(bin_time_ind == bin_time).flatten() + final[bin_time - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) return final - # - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # - # binned_means.append(er_selected_data) - # - # er_means = np.vstack(binned_means) - # - # final_list = [] - # for bin_ping in range(1, n_bin_ping + 1): - # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - # final_list.append(10 * np.log10(np.nanmean(er_means[:, indices], axis=1))) - # - # return np.vstack(final_list) - - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # # er_selected_data = arr[:, bin_er_ind == bin_er] - # - # binned_means.append(er_selected_data) - # - # temp = np.vstack(binned_means) - # - # temp2 = [] - # for bin_ping in range(1, n_bin_ping + 1): - # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - # temp2.append(10 * np.log10(np.nanmean(temp[:, indices], axis=1))) - - # return np.vstack(temp2) - - # return binned_means - - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # - # data_rows = [] # TODO: rename? 
- # for bin_ping in range(1, n_bin_ping + 1): - # data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) - # - # temp = np.concatenate(data_rows, axis=0) - # temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") - # binned_means.append(temp) - # - # stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") - # return 10 * np.log10(stacked_means) - - # binned_means = [] - # for bin_ping in range(1, n_bin_ping + 1): - # - # # ping_selected_data = arr[bin_ping_ind == bin_ping, :] - # ping_selected_data = np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) - # - # data_rows = [] - # # TODO: can we do a mapping here, instead of a for loop? - # for bin_er in range(1, n_bin_er): - # # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) - # data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) - # - # # TODO:: can we use something besides stack that is more efficient? - # binned_means.append(np.stack(data_rows, axis=0)) - - # if isinstance(arr, dask.array.Array): - # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) #.rechunk("auto"))) - # else: - # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): @@ -491,9 +449,9 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): all_MVBS.append( bin_and_mean_2d( sv.data, - bins_ping=ping_interval, + bins_time=ping_interval, bins_er=range_interval, - pings=sv.ping_time.data, + times=sv.ping_time.data, echo_range=sv.echo_range.data, ) ) From 44adc23c15c5b944f71aa85ba822206ace3579a7 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 3 Nov 2022 17:06:12 -0700 Subject: [PATCH 08/28] start creating a bin and mean 2d method that accounts for different echo_range values for each ping time --- echopype/preprocess/api.py | 150 +++++++++++++++++-- echopype/tests/preprocess/test_preprocess.py | 13 +- 2 files changed, 144 insertions(+), 19 deletions(-) 
diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index b594ae5bb..e84e8ae69 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,6 +2,8 @@ Functions for enhancing the spatial and temporal coherence of data. """ +from typing import Tuple + import dask.array import numpy as np import pandas as pd @@ -382,6 +384,115 @@ def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): return final +def get_bin_indices( + echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: + """ + Obtains the bin index of ``echo_range`` and ``times`` based + on the binning ``bins_er`` and ``bins_time``, respectively. + + Parameters + ---------- + echo_range: np.ndarray + 2D array of echo range values + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + + Returns + ------- + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + bin_time_ind: np.ndarray + 1D array of bin indices for ``times`` + """ + + # get bin index for each echo range value + digitized_echo_range = np.digitize(echo_range, bins_er, right=False) + + # turn datetime into integers, so we can use np.digitize + if isinstance(times, dask.array.Array): + times_i8 = times.compute().data.view("i8") + else: + times_i8 = times.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_time_i8 = bins_time.view("i8") + + # get bin index for each time + bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) + + return digitized_echo_range, bin_time_ind + + +def mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): + + means = [] + for bin_er in range(1, n_bin_er): + means.append(10 * np.log10(np.nanmean(temp_arr[dig_er_subset 
== bin_er], axis=0))) + + return means + + +def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): + """ + + + Returns + ------- + + + Notes + ----- + This function assumes that ``arr`` has rows corresponding to + ping time and columns corresponding to echo range. + """ + + # determine if array to bin is lazy or not + is_lazy = False + if isinstance(arr, dask.array.Array): + is_lazy = True + + print(f"is_lazy = {is_lazy}") + + # get the number of echo range and time bins + n_bin_er = len(bins_er) + n_bin_time = len(bins_time) + + # obtain the bin indices for echo_range and times + digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) + + all_means = [] + for bin_time in range(1, n_bin_time + 1): + + # get the indices of time in bin index bin_time + indices_time = np.argwhere(bin_time_ind == bin_time).flatten() + + # select only those array values that are in the time bin being considered + temp_arr = arr[indices_time, :] + + if is_lazy: + all_means.append( + dask.delayed(mean_temp_arr)( + n_bin_er, temp_arr, digitized_echo_range[indices_time, :] + ) + ) + else: + all_means.append( + mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) + ) + + if is_lazy: + all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? + + final_reduced = np.array(all_means) + + return final_reduced + + def bin_and_mean_times(er_means, bins_time_i8, times_i8): """ @@ -421,6 +532,8 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # ping_interval = np.array(list(ds_Sv.ping_time. 
# resample(ping_time='20s', skipna=True).groups.keys())) + print("account for uneven echo range ") + all_MVBS = [] for chan in ds_Sv.channel: @@ -431,28 +544,37 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) - # set 1D coordinate using the 1st ping echo_range since identical for all pings - # remove padded NaN entries if exist for all pings - er = ( - ds["echo_range"] - .dropna(dim="range_sample", how="all") - .dropna(dim="ping_time") - .isel(ping_time=0) - ) + # # set 1D coordinate using the 1st ping echo_range since identical for all pings + # # remove padded NaN entries if exist for all pings + # er = ( + # ds["echo_range"] + # .dropna(dim="range_sample", how="all") + # .dropna(dim="ping_time") + # .isel(ping_time=0) + # ) - # use first ping er as indexer for sv - sv = sv.sel(range_sample=er.range_sample.values) - sv.coords["echo_range"] = (["range_sample"], er.values) - sv = sv.swap_dims({"range_sample": "echo_range"}) + # # use first ping er as indexer for sv + # sv = sv.sel(range_sample=er.range_sample.values) + # sv.coords["echo_range"] = (["range_sample"], er.values) + # sv = sv.swap_dims({"range_sample": "echo_range"}) # collect the MVBS calculated for the channel + # all_MVBS.append( + # bin_and_mean_2d( + # sv.data, + # bins_time=ping_interval, + # bins_er=range_interval, + # times=sv.ping_time.data, + # echo_range=sv.echo_range.data, + # ) + # ) all_MVBS.append( - bin_and_mean_2d( + bin_and_mean_2d_v2( sv.data, bins_time=ping_interval, bins_er=range_interval, times=sv.ping_time.data, - echo_range=sv.echo_range.data, + echo_range=ds["echo_range"].data, ) ) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index e2876a6d7..4e4c64d20 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -542,21 +542,24 @@ def create_known_mean_data(create_dask, 
num_pings_in_bin, num_er_in_bin): for count, bin_val in enumerate(er_bins): if create_dask: - a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) else: - a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + a = np.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) temp.append(a) means.append([arr.mean() for arr in temp]) arrays.append(np.concatenate(temp, axis=0)) + echo_range = np.linspace(0, er_csum.max(), num=er_csum.max() + 1, dtype=np.int64) + if create_dask: means = dask.array.from_array(means).compute() + echo_range = dask.array.from_array(echo_range) full_array = np.concatenate(arrays, axis=1) - return means, full_array, ping_bins, er_bins + return means, full_array, ping_bins, er_bins, echo_range def test_bin_and_mean_2d(): @@ -567,7 +570,7 @@ def test_bin_and_mean_2d(): num_pings_in_bin = np.random.randint(low=10, high=1000, size=10) num_er_in_bin = np.random.randint(low=10, high=1000, size=10) - means, full_array, ping_bins, er_bins = create_known_mean_data(create_dask, + means, full_array, ping_bins, er_bins, echo_range = create_known_mean_data(create_dask, num_pings_in_bin, num_er_in_bin) ping_times = np.concatenate([np.random.uniform(bin_val[0], bin_val[1], num_pings_in_bin[count]) @@ -576,7 +579,7 @@ def test_bin_and_mean_2d(): digitize_ping_bin = [*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:]] digitize_er_bin = [*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]] - calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times) + calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times, echo_range) if create_dask: calc_means = calc_means.compute() From a79dae459660e0895504aefb83bce4f0ef0376cc Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 11:07:06 -0700 Subject: [PATCH 09/28] replace previous bin and 
mean method with general method, construct mock sv values, and test general method against mock values --- echopype/preprocess/api.py | 131 +++---------------- echopype/tests/preprocess/test_preprocess.py | 115 +++++++++++----- 2 files changed, 99 insertions(+), 147 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index e84e8ae69..fb4b17072 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -334,56 +334,6 @@ def regrid(): return 1 -def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): - """ - Bins and means a 2D array - - - bins_ping, bins_er are bins in np.digitize format - - TODO: document! - - Notes - ----- - This function assumes that values in ``arr`` are not outside - of the bins presecribed by ``bins_er``. Additionally, it also - assumes that values in ``pings`` are not outside the bins given - by bins_ping. - """ - - # TODO: put in check to make sure arr is 2D - - # get bin index for each echo_range value - if isinstance(echo_range, dask.array.Array): - bin_er_ind = np.digitize(echo_range.compute(), bins_er, right=False) - else: - bin_er_ind = np.digitize(echo_range, bins_er, right=False) - - n_bin_er = len(bins_er) - - binned_means = [] - for bin_er in range(1, n_bin_er): - - er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - - binned_means.append(er_selected_data) - - er_means = np.vstack(binned_means).compute() - - # turn datetime into integers, so we can use np.digitize - if isinstance(times, dask.array.Array): - times_i8 = times.compute().data.view("i8") - else: - times_i8 = times.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_time_i8 = bins_time.view("i8") - - final = bin_and_mean_times(er_means, bins_time_i8, times_i8) - - return final - - def get_bin_indices( echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray ) -> Tuple[np.ndarray, np.ndarray]: @@ -429,15 +379,16 @@ def get_bin_indices( def 
mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): + # TODO: document means = [] for bin_er in range(1, n_bin_er): - means.append(10 * np.log10(np.nanmean(temp_arr[dig_er_subset == bin_er], axis=0))) + means.append(np.nanmean(temp_arr[dig_er_subset == bin_er], axis=0)) return means -def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): +def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): """ @@ -451,6 +402,8 @@ def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): ping time and columns corresponding to echo range. """ + # TODO: document + # determine if array to bin is lazy or not is_lazy = False if isinstance(arr, dask.array.Array): @@ -493,38 +446,6 @@ def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): return final_reduced -def bin_and_mean_times(er_means, bins_time_i8, times_i8): - """ - - - Parameters - ---------- - er_means - bins_time - times - - Returns - ------- - - Notes - ----- - This function assumes that ``times`` corresponds to the - columns of ``er_means``. 
- """ - - # get bin index for each time - bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) - - n_bin_time = len(bins_time_i8) - - final = np.empty((n_bin_time, er_means.shape[0])) - for bin_time in range(1, n_bin_time + 1): - indices = np.argwhere(bin_time_ind == bin_time).flatten() - final[bin_time - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) - - return final - - def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # range_meter_bin = 5 @@ -544,38 +465,16 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) - # # set 1D coordinate using the 1st ping echo_range since identical for all pings - # # remove padded NaN entries if exist for all pings - # er = ( - # ds["echo_range"] - # .dropna(dim="range_sample", how="all") - # .dropna(dim="ping_time") - # .isel(ping_time=0) - # ) - - # # use first ping er as indexer for sv - # sv = sv.sel(range_sample=er.range_sample.values) - # sv.coords["echo_range"] = (["range_sample"], er.values) - # sv = sv.swap_dims({"range_sample": "echo_range"}) - - # collect the MVBS calculated for the channel - # all_MVBS.append( - # bin_and_mean_2d( - # sv.data, - # bins_time=ping_interval, - # bins_er=range_interval, - # times=sv.ping_time.data, - # echo_range=sv.echo_range.data, - # ) - # ) - all_MVBS.append( - bin_and_mean_2d_v2( - sv.data, - bins_time=ping_interval, - bins_er=range_interval, - times=sv.ping_time.data, - echo_range=ds["echo_range"].data, - ) + # get MVBS for channel in linear domain + chan_MVBS = bin_and_mean_2d( + sv.data, + bins_time=ping_interval, + bins_er=range_interval, + times=sv.ping_time.data, + echo_range=ds["echo_range"].data, ) + # apply inverse mapping to get back to the original domain and store values + all_MVBS.append(10 * np.log10(chan_MVBS)) + return np.stack(all_MVBS, axis=0) diff --git a/echopype/tests/preprocess/test_preprocess.py 
b/echopype/tests/preprocess/test_preprocess.py index 4e4c64d20..1b9f5dd85 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -525,64 +525,117 @@ def create_bins(csum_array): return bins -def create_known_mean_data(create_dask, num_pings_in_bin, num_er_in_bin): - # TODO: document! +def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range): + + # TODO: document and split up function into parts + + num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) + # create bins for ping_time dimension ping_csum = np.cumsum(num_pings_in_bin) ping_bins = create_bins(ping_csum) + # create bins for echo_range dimension + num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) er_csum = np.cumsum(num_er_in_bin) er_bins = create_bins(er_csum) - arrays = [] - means = [] - for ping_bin in num_pings_in_bin: + # build echo_range array + final_arrays = [] + all_er_bin_nums = [] + ping_times_in_bin = [] + for ping_ind, ping_bin in enumerate(ping_bins): + + ping_times_in_bin.append(np.random.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) + + # randomly determine the number of values in each echo_range bin + num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) - temp = [] + # store the number of values in each echo_range bin + all_er_bin_nums.append(num_er_in_bin) + + er_row_block = [] for count, bin_val in enumerate(er_bins): + # create a block of echo_range values + a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) + + # store the block of echo_range values + er_row_block.append(a) + + # collect and construct echo_range row block + final_arrays.append(np.concatenate(er_row_block, axis=1)) - if create_dask: - a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) - else: - a = 
np.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) + # get final ping_time dimension + final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') - temp.append(a) + # get maximum number of echo_range elements amongst all times + max_num_er_elem = max([arr.shape[1] for arr in final_arrays]) - means.append([arr.mean() for arr in temp]) - arrays.append(np.concatenate(temp, axis=0)) + # total number of ping times + tot_num_times = ping_csum[-1] - echo_range = np.linspace(0, er_csum.max(), num=er_csum.max() + 1, dtype=np.int64) + # pad echo_range dimension with nans and create final echo_range + final_er = np.empty((tot_num_times, max_num_er_elem)) + final_er[:] = np.nan + for count, arr in enumerate(final_arrays): - if create_dask: - means = dask.array.from_array(means).compute() - echo_range = dask.array.from_array(echo_range) + if count == 0: + final_er[0:ping_csum[count], 0:arr.shape[1]] = arr + else: + final_er[ping_csum[count - 1]:ping_csum[count], 0:arr.shape[1]] = arr - full_array = np.concatenate(arrays, axis=1) + # pad echo_range dimension with nans and create final sv + final_sv = np.empty((tot_num_times, max_num_er_elem)) + final_sv[:] = np.nan - return means, full_array, ping_bins, er_bins, echo_range + final_means = [] + for count, arr in enumerate(all_er_bin_nums): + # create sv row values using natural numbers + sv_row_list = [np.arange(1, num_elem + 1, 1) for num_elem in arr] + + # create final sv row + sv_row = np.concatenate(sv_row_list) + + # get final mean which is n+1/2 (since we are using natural numbers) + final_means.append([(len(elem) + 1) / 2.0 for elem in sv_row_list]) + + # create sv row block + sv_row_block = np.tile(sv_row, (num_pings_in_bin[count], 1)) + + if count == 0: + final_sv[0:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block + else: + final_sv[ping_csum[count - 1]:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block + + # create final sv MVBS + final_MVBS = np.vstack(final_means) + + 
return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time def test_bin_and_mean_2d(): + # TODO: create lazy option for sv and echo_range + # TODO: document and create a fixture with input of create_dask create_dask = True - num_pings_in_bin = np.random.randint(low=10, high=1000, size=10) - num_er_in_bin = np.random.randint(low=10, high=1000, size=10) + final_num_ping_bins = 2 + final_num_er_bins = 5 - means, full_array, ping_bins, er_bins, echo_range = create_known_mean_data(create_dask, - num_pings_in_bin, num_er_in_bin) + ping_range = [1, 5] + er_range = [1, 5] - ping_times = np.concatenate([np.random.uniform(bin_val[0], bin_val[1], num_pings_in_bin[count]) - for count, bin_val in enumerate(ping_bins)]) + known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range) - digitize_ping_bin = [*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:]] - digitize_er_bin = [*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]] + digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) + digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) + digitize_ping_bin = digitize_ping_bin.astype('datetime64[ns]') - calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times, echo_range) + calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, + bins_er=digitize_er_bin, times=final_ping_time, echo_range=final_er) - if create_dask: - calc_means = calc_means.compute() + assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) - assert np.allclose(calc_means, means, atol=1e-10, rtol=1e-10) From 2a7629aa6f3706b7968f43c75e262416af671c4e Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 14:57:24 -0700 Subject: [PATCH 10/28] document and clean up code associated with creating the mock data set for MVBS testing --- 
echopype/tests/preprocess/test_preprocess.py | 226 +++++++++++++++++-- 1 file changed, 202 insertions(+), 24 deletions(-) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 1b9f5dd85..9a7109a62 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,6 +6,7 @@ import dask.array from echopype.preprocess.api import bin_and_mean_2d +from typing import List, Tuple, Iterable @pytest.fixture( @@ -512,40 +513,80 @@ def test_preprocess_mvbs(test_data_samples): assert ep.preprocess.compute_MVBS(Sv) is not None -def create_bins(csum_array): - # TODO: document! +def create_bins(csum_array: np.ndarray) -> Iterable: + """ + Constructs bin ranges based off of a cumulative + sum array. + + Parameters + ---------- + csum_array: np.ndarray + 1D array representing a cumulative sum + + Returns + ------- + bins: list + A list whose elements are the lower and upper bin ranges + """ + bins = [] + + # construct bins for count, csum in enumerate(csum_array): if count == 0: - bins.append((0, csum)) + + bins.append([0, csum]) + else: - # add 0.01 so that left bins don't overlap - bins.append((csum_array[count-1] + 0.01, csum)) - return bins + # add 0.01 so that left bins don't overlap + bins.append([csum_array[count-1] + 0.01, csum]) -def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range): + return bins - # TODO: document and split up function into parts - num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) +def create_echo_range_related_data(ping_bins: Iterable, + num_pings_in_bin: np.ndarray, + er_range: list, er_bins: Iterable, + final_num_er_bins: int) -> Tuple[list, list, list]: + """ + Creates ``echo_range`` values and associated bin information. 
- # create bins for ping_time dimension - ping_csum = np.cumsum(num_pings_in_bin) - ping_bins = create_bins(ping_csum) + Parameters + ---------- + ping_bins: list + A list whose elements are the lower and upper ping time bin ranges + num_pings_in_bin: np.ndarray + Specifies the number of pings in each ping time bin + er_range: list + A list whose first element is the lowest and second element is + the highest possible number of echo range values in a given bin + er_bins: list + A list whose elements are the lower and upper echo range bin ranges + final_num_er_bins: int + The total number of echo range bins - # create bins for echo_range dimension - num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) - er_csum = np.cumsum(num_er_in_bin) - er_bins = create_bins(er_csum) + Returns + ------- + all_er_bin_nums: list of np.ndarray + A list whose elements are the number of values in each echo_range + bin, for each ping bin + ping_times_in_bin: list of np.ndarray + A list whose elements are the ping_time values for each corresponding bin + final_arrays: list of np.ndarray + A list whose elements are the echo_range values for a given ping and + echo range bin block + """ - # build echo_range array final_arrays = [] all_er_bin_nums = [] ping_times_in_bin = [] + + # build echo_range array for ping_ind, ping_bin in enumerate(ping_bins): + # create the ping times associated with each ping bin ping_times_in_bin.append(np.random.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) # randomly determine the number of values in each echo_range bin @@ -566,8 +607,30 @@ def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, e # collect and construct echo_range row block final_arrays.append(np.concatenate(er_row_block, axis=1)) - # get final ping_time dimension - final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') + return all_er_bin_nums, ping_times_in_bin, final_arrays + + +def 
construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], + ping_csum: np.ndarray) -> Tuple[np.ndarray, int]: + """ + Creates the final 2D ``echo_range`` array with appropriate padding. + + Parameters + ---------- + final_arrays: list of np.ndarray + A list whose elements are the echo_range values for a given ping and + echo range bin block + ping_csum: np.ndarray + 1D array representing the cumulative sum for the number of ping times + in each ping bin + + Returns + ------- + final_er: np.ndarray + The final 2D ``echo_range`` array + max_num_er_elem: int + The maximum number of ``echo_range`` elements amongst all times + """ # get maximum number of echo_range elements amongst all times max_num_er_elem = max([arr.shape[1] for arr in final_arrays]) @@ -585,6 +648,39 @@ def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, e else: final_er[ping_csum[count - 1]:ping_csum[count], 0:arr.shape[1]] = arr + return final_er, max_num_er_elem + + +def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, + all_er_bin_nums: Iterable[np.ndarray], + num_pings_in_bin: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """ + Creates the final 2D Sv array with appropriate padding. 
+ + Parameters + ---------- + max_num_er_elem: int + The maximum number of ``echo_range`` elements amongst all times + ping_csum: np.ndarray + 1D array representing the cumulative sum for the number of ping times + in each ping bin + all_er_bin_nums: list of np.ndarray + A list whose elements are the number of values in each echo_range + bin, for each ping bin + num_pings_in_bin: np.ndarray + Specifies the number of pings in each ping time bin + + Returns + ------- + final_sv: np.ndarray + The final 2D Sv array + final_MVBS: np.ndarray + The final 2D known MVBS array + """ + + # total number of ping times + tot_num_times = ping_csum[-1] + # pad echo_range dimension with nans and create final sv final_sv = np.empty((tot_num_times, max_num_er_elem)) final_sv[:] = np.nan @@ -612,9 +708,85 @@ def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, e # create final sv MVBS final_MVBS = np.vstack(final_means) + return final_sv, final_MVBS + + +def create_known_mean_data(final_num_ping_bins: int, + final_num_er_bins: int, + ping_range: list, + er_range: list) -> Tuple[np.ndarray, np.ndarray, Iterable, + Iterable, np.ndarray, np.ndarray]: + """ + Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays + where the MVBS is known. 
+ + Parameters + ---------- + final_num_ping_bins: int + The total number of ping time bins + final_num_er_bins: int + The total number of echo range bins + ping_range: list + A list whose first element is the lowest and second element is + the highest possible number of ping time values in a given bin + er_range: list + A list whose first element is the lowest and second element is + the highest possible number of echo range values in a given bin + + Returns + ------- + final_MVBS: np.ndarray + The final 2D known MVBS array + final_sv: np.ndarray + The final 2D Sv array + ping_bins: Iterable + A list whose elements are the lower and upper ping time bin ranges + er_bins: Iterable + A list whose elements are the lower and upper echo range bin ranges + final_er: np.ndarray + The final 2D ``echo_range`` array + final_ping_time: np.ndarray + The final 1D ``ping_time`` array + """ + + # randomly generate the number of pings in each ping bin + num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) + + # create bins for ping_time dimension + ping_csum = np.cumsum(num_pings_in_bin) + ping_bins = create_bins(ping_csum) + + # create bins for echo_range dimension + num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) + er_csum = np.cumsum(num_er_in_bin) + er_bins = create_bins(er_csum) + + # create the echo_range data and associated bin information + all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, + er_range, er_bins, + final_num_er_bins) + + # create the final echo_range array using created data and padding + final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum) + + # get final ping_time dimension + final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') + + # create the final sv array + final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, 
all_er_bin_nums, num_pings_in_bin) + return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time -def test_bin_and_mean_2d(): + +def test_bin_and_mean_2d() -> None: + """ + Tests the function ``bin_and_mean_2d``, which is the core + method for ``compute_MVBS``. This is done by creating mock + data (which can have varying number of ``echo_range`` values + for each ``ping_time``) with known means. + """ + + # TODO: create lazy option for sv and echo_range # TODO: document and create a fixture with input of create_dask @@ -627,15 +799,21 @@ def test_bin_and_mean_2d(): ping_range = [1, 5] er_range = [1, 5] - known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range) + # create echo_range, ping_time, and Sv arrays where the MVBS is known + known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, + final_num_er_bins, + ping_range, er_range) + # put the created ping bins into a form that works with bin_and_mean_2d digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) - digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) digitize_ping_bin = digitize_ping_bin.astype('datetime64[ns]') + # put the created echo range bins into a form that works with bin_and_mean_2d + digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) + + # calculate MVBS for mock data set calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, bins_er=digitize_er_bin, times=final_ping_time, echo_range=final_er) + # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) - - From 86a14914cfab071b6eb1a063efe437bfbae002cf Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 15:26:25 -0700 Subject: [PATCH 11/28] allow Sv and echo_range arrays to be dask 
arrays in mock data --- echopype/tests/preprocess/test_preprocess.py | 76 ++++++++++++++------ 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 9a7109a62..fc468801d 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,7 +6,7 @@ import dask.array from echopype.preprocess.api import bin_and_mean_2d -from typing import List, Tuple, Iterable +from typing import List, Tuple, Iterable, Union @pytest.fixture( @@ -549,7 +549,8 @@ def create_bins(csum_array: np.ndarray) -> Iterable: def create_echo_range_related_data(ping_bins: Iterable, num_pings_in_bin: np.ndarray, er_range: list, er_bins: Iterable, - final_num_er_bins: int) -> Tuple[list, list, list]: + final_num_er_bins: int, + create_dask: bool) -> Tuple[list, list, list]: """ Creates ``echo_range`` values and associated bin information. @@ -566,6 +567,9 @@ def create_echo_range_related_data(ping_bins: Iterable, A list whose elements are the lower and upper echo range bin ranges final_num_er_bins: int The total number of echo range bins + create_dask: bool + If True ``final_arrays`` values will be + dask arrays, else they will be numpy arrays Returns ------- @@ -574,7 +578,7 @@ def create_echo_range_related_data(ping_bins: Iterable, bin, for each ping bin ping_times_in_bin: list of np.ndarray A list whose elements are the ping_time values for each corresponding bin - final_arrays: list of np.ndarray + final_arrays: list of np.ndarray or dask.array.Array A list whose elements are the echo_range values for a given ping and echo range bin block """ @@ -597,9 +601,14 @@ def create_echo_range_related_data(ping_bins: Iterable, er_row_block = [] for count, bin_val in enumerate(er_bins): + # create a block of echo_range values - a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) + if create_dask: + a = 
dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) + else: + a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) # store the block of echo_range values er_row_block.append(a) @@ -611,7 +620,8 @@ def create_echo_range_related_data(ping_bins: Iterable, def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], - ping_csum: np.ndarray) -> Tuple[np.ndarray, int]: + ping_csum: np.ndarray, + create_dask: bool) -> Tuple[Union[np.ndarray, dask.array.Array], int]: """ Creates the final 2D ``echo_range`` array with appropriate padding. @@ -623,10 +633,12 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], ping_csum: np.ndarray 1D array representing the cumulative sum for the number of ping times in each ping bin + create_dask: bool + If True ``final_er`` will be a dask array, else it will be a numpy array Returns ------- - final_er: np.ndarray + final_er: np.ndarray or dask.array.Array The final 2D ``echo_range`` array max_num_er_elem: int The maximum number of ``echo_range`` elements amongst all times @@ -639,8 +651,12 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], tot_num_times = ping_csum[-1] # pad echo_range dimension with nans and create final echo_range - final_er = np.empty((tot_num_times, max_num_er_elem)) - final_er[:] = np.nan + if create_dask: + final_er = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan + else: + final_er = np.empty((tot_num_times, max_num_er_elem)) + final_er[:] = np.nan + for count, arr in enumerate(final_arrays): if count == 0: @@ -653,7 +669,9 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, all_er_bin_nums: Iterable[np.ndarray], - num_pings_in_bin: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + num_pings_in_bin: np.ndarray, + create_dask: bool) -> Tuple[Union[np.ndarray, 
dask.array.Array], + np.ndarray]: """ Creates the final 2D Sv array with appropriate padding. @@ -669,10 +687,12 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, bin, for each ping bin num_pings_in_bin: np.ndarray Specifies the number of pings in each ping time bin + create_dask: bool + If True ``final_sv`` will be a dask array, else it will be a numpy array Returns ------- - final_sv: np.ndarray + final_sv: np.ndarray or dask.array.Array The final 2D Sv array final_MVBS: np.ndarray The final 2D known MVBS array @@ -682,8 +702,11 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, tot_num_times = ping_csum[-1] # pad echo_range dimension with nans and create final sv - final_sv = np.empty((tot_num_times, max_num_er_elem)) - final_sv[:] = np.nan + if create_dask: + final_sv = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan + else: + final_sv = np.empty((tot_num_times, max_num_er_elem)) + final_sv[:] = np.nan final_means = [] for count, arr in enumerate(all_er_bin_nums): @@ -714,8 +737,8 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, def create_known_mean_data(final_num_ping_bins: int, final_num_er_bins: int, ping_range: list, - er_range: list) -> Tuple[np.ndarray, np.ndarray, Iterable, - Iterable, np.ndarray, np.ndarray]: + er_range: list, create_dask: bool) -> Tuple[np.ndarray, np.ndarray, Iterable, + Iterable, np.ndarray, np.ndarray]: """ Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays where the MVBS is known. @@ -732,6 +755,9 @@ def create_known_mean_data(final_num_ping_bins: int, er_range: list A list whose first element is the lowest and second element is the highest possible number of echo range values in a given bin + create_dask: bool + If True the ``Sv`` and ``echo_range`` values produced will be + dask arrays, else they will be numpy arrays. 
Returns ------- @@ -764,16 +790,18 @@ def create_known_mean_data(final_num_ping_bins: int, # create the echo_range data and associated bin information all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, er_range, er_bins, - final_num_er_bins) + final_num_er_bins, + create_dask) # create the final echo_range array using created data and padding - final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum) + final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum, create_dask) # get final ping_time dimension final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') # create the final sv array - final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, all_er_bin_nums, num_pings_in_bin) + final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, + all_er_bin_nums, num_pings_in_bin, create_dask) return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time @@ -784,10 +812,13 @@ def test_bin_and_mean_2d() -> None: method for ``compute_MVBS``. This is done by creating mock data (which can have varying number of ``echo_range`` values for each ``ping_time``) with known means. - """ - - # TODO: create lazy option for sv and echo_range + Parameters + ---------- + create_dask: bool + If True the ``Sv`` and ``echo_range`` values produced will be + dask arrays, else they will be numpy arrays. 
+ """ # TODO: document and create a fixture with input of create_dask @@ -802,7 +833,8 @@ def test_bin_and_mean_2d() -> None: # create echo_range, ping_time, and Sv arrays where the MVBS is known known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, - ping_range, er_range) + ping_range, er_range, + create_dask) # put the created ping bins into a form that works with bin_and_mean_2d digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) From 49b374563e8c3f80e2c6d39dda7cd56f6f8b5425 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 15:41:54 -0700 Subject: [PATCH 12/28] add pytest fixture to test_bin_and_mean_2d --- echopype/tests/preprocess/test_preprocess.py | 55 ++++++++++++++++---- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index fc468801d..7a6701795 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,7 +6,7 @@ import dask.array from echopype.preprocess.api import bin_and_mean_2d -from typing import List, Tuple, Iterable, Union +from typing import Tuple, Iterable, Union @pytest.fixture( @@ -806,7 +806,37 @@ def create_known_mean_data(final_num_ping_bins: int, return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time -def test_bin_and_mean_2d() -> None: +@pytest.fixture( + params=[ + { + "create_dask": True, + "final_num_ping_bins": 10, + "final_num_er_bins": 10, + "ping_range": [10, 1000], + "er_range": [10, 1000] + }, + { + "create_dask": False, + "final_num_ping_bins": 10, + "final_num_er_bins": 10, + "ping_range": [10, 1000], + "er_range": [10, 1000] + }, + ], + ids=[ + "delayed_data", + "in_memory_data" + ], +) +def bin_and_mean_2d_params(request): + """ + Obtains all necessary parameters for ``test_bin_and_mean_2d``. 
+ """ + + return list(request.param.values()) + + +def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: """ Tests the function ``bin_and_mean_2d``, which is the core method for ``compute_MVBS``. This is done by creating mock @@ -818,17 +848,20 @@ def test_bin_and_mean_2d() -> None: create_dask: bool If True the ``Sv`` and ``echo_range`` values produced will be dask arrays, else they will be numpy arrays. + final_num_ping_bins: int + The total number of ping time bins + final_num_er_bins: int + The total number of echo range bins + ping_range: list + A list whose first element is the lowest and second element is + the highest possible number of ping time values in a given bin + er_range: list + A list whose first element is the lowest and second element is + the highest possible number of echo range values in a given bin """ - # TODO: document and create a fixture with input of create_dask - - create_dask = True - - final_num_ping_bins = 2 - final_num_er_bins = 5 - - ping_range = [1, 5] - er_range = [1, 5] + # get all parameters needed to create the mock data + create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params # create echo_range, ping_time, and Sv arrays where the MVBS is known known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, From ae12a6e8f5c7aacab460ef336bc78cf8d01ccc28 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 16:32:10 -0700 Subject: [PATCH 13/28] begin documenting all functions needed to compute MVBS along a channel --- echopype/preprocess/api.py | 94 ++++++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 13 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index fb4b17072..233233da7 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,7 +2,7 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" -from typing import Tuple +from typing import List, Tuple, Union import dask.array import numpy as np @@ -378,8 +378,34 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind -def mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): - # TODO: document +def mean_temp_arr( + n_bin_er: int, + temp_arr: Union[dask.array.Array, np.ndarray], + dig_er_subset: Union[dask.array.Array, np.ndarray], +) -> List[Union[dask.array.Array, np.ndarray]]: + """ + Bins the data in ``temp_arr`` with respect to the + ``echo_range`` bin and means the resulting bin. + + Parameters + ---------- + n_bin_er: int + The number of echo range bins + temp_arr: dask.array.Array or np.ndarray + Array of Sv values at the ``ping_time`` bin index being considered + dig_er_subset: dask.array.Array or np.ndarray + Array representing the digitized (bin indices) for ``echo_range`` at + the ``ping_time`` bin index being considered + + Returns + ------- + means: list of dask.array.Array or np.ndarray + + Notes + ----- + It is necessary for this to be a function because we may need to + delay it. + """ means = [] for bin_er in range(1, n_bin_er): @@ -388,29 +414,50 @@ def mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): return means -def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): +def bin_and_mean_2d( + arr: Union[dask.array.Array, np.ndarray], + bins_time: np.ndarray, + bins_er: np.ndarray, + times: np.ndarray, + echo_range: np.ndarray, +) -> np.ndarray: """ + Bins and means ``arr`` based on ``times`` and ``echo_range``, + and their corresponding bins. If ``arr`` is ``Sv`` then this + will compute the MVBS. 
+ Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + echo_range: np.ndarray + 2D array of echo range values Returns ------- - + final_reduced: np.ndarray + The final binned and meaned ``arr``, if ``arr`` is ``Sv`` then this is the MVBS Notes ----- This function assumes that ``arr`` has rows corresponding to - ping time and columns corresponding to echo range. - """ + ``ping_time`` and columns corresponding to ``echo_range``. - # TODO: document + This function allows the number of ``echo_range`` values to + vary amongst ``ping_times``. + """ # determine if array to bin is lazy or not is_lazy = False if isinstance(arr, dask.array.Array): is_lazy = True - print(f"is_lazy = {is_lazy}") - # get the number of echo range and time bins n_bin_er = len(bins_er) n_bin_time = len(bins_time) @@ -427,6 +474,7 @@ def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): # select only those array values that are in the time bin being considered temp_arr = arr[indices_time, :] + # bin and mean with respect to echo_range bins if is_lazy: all_means.append( dask.delayed(mean_temp_arr)( @@ -439,22 +487,41 @@ def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): ) if is_lazy: + # compute all constructs means all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? 
+ # construct final reduced form of arr final_reduced = np.array(all_means) return final_reduced -def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): +def get_MVBS_along_channels( + ds_Sv: xr.Dataset, range_interval: np.ndarray, ping_interval: np.ndarray +): + """ + Computes the MVBS of ``ds_SV`` along each channel for the given + intervals. + + Parameters + ---------- + ds_Sv: xr.Dataset + + range_interval: np.ndarray + + ping_interval: np.ndarray + + + Returns + ------- + + """ # range_meter_bin = 5 # range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) # ping_interval = np.array(list(ds_Sv.ping_time. # resample(ping_time='20s', skipna=True).groups.keys())) - print("account for uneven echo range ") - all_MVBS = [] for chan in ds_Sv.channel: @@ -477,4 +544,5 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # apply inverse mapping to get back to the original domain and store values all_MVBS.append(10 * np.log10(chan_MVBS)) + # collect the MVBS values for each channel return np.stack(all_MVBS, axis=0) From 2a4afcd2c1e5540c0a7e9fee2e579f136f1dff0f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 7 Nov 2022 08:45:41 -0800 Subject: [PATCH 14/28] finish documenting get_MVBS_along_channels function --- echopype/preprocess/api.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 233233da7..c61b4c1ad 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -442,7 +442,7 @@ def bin_and_mean_2d( Returns ------- final_reduced: np.ndarray - The final binned and meaned ``arr``, if ``arr`` is ``Sv`` then this is the MVBS + The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS Notes ----- @@ -497,31 +497,36 @@ def bin_and_mean_2d( def get_MVBS_along_channels( - ds_Sv: xr.Dataset, range_interval: np.ndarray, ping_interval: np.ndarray -): + ds_Sv: 
xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray +) -> np.ndarray: """ - Computes the MVBS of ``ds_SV`` along each channel for the given + Computes the MVBS of ``ds_Sv`` along each channel for the given intervals. Parameters ---------- ds_Sv: xr.Dataset - - range_interval: np.ndarray - + A Dataset containing ``Sv`` and ``echo_range`` data with coordinates + ``channel``, ``ping_time``, and ``range_sample`` + echo_range_interval: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` ping_interval: np.ndarray - + 1D array (used by np.digitize) representing the binning required for ``ping_time`` Returns ------- + np.ndarray + The MVBS value of the input ``ds_Sv`` for all channels + Notes + ----- + If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with + respect to ``echo_range`` will take place, then the binning and mean with respect to + ``ping_time`` will be a delayed operation, and lastly the delayed values will be + computed. It is necessary to apply a compute at the end of this method because Dask + graph layers get too large and this makes downstream operations very inefficient. """ - # range_meter_bin = 5 - # range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) - # ping_interval = np.array(list(ds_Sv.ping_time. 
- # resample(ping_time='20s', skipna=True).groups.keys())) - all_MVBS = [] for chan in ds_Sv.channel: @@ -536,7 +541,7 @@ def get_MVBS_along_channels( chan_MVBS = bin_and_mean_2d( sv.data, bins_time=ping_interval, - bins_er=range_interval, + bins_er=echo_range_interval, times=sv.ping_time.data, echo_range=ds["echo_range"].data, ) From d55ef170c0c0fc2efecf32b8b444b437892c3e38 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 7 Nov 2022 17:17:52 -0800 Subject: [PATCH 15/28] incorporate new bin and mean method into compute_MVBS and go back to old way of assuming echo_range is the same throughout ping_time --- echopype/preprocess/__init__.py | 3 +- echopype/preprocess/api.py | 207 +++++++++++++++++++++++++++----- 2 files changed, 180 insertions(+), 30 deletions(-) diff --git a/echopype/preprocess/__init__.py b/echopype/preprocess/__init__.py index c6f9606d5..1f7576bfd 100644 --- a/echopype/preprocess/__init__.py +++ b/echopype/preprocess/__init__.py @@ -1,7 +1,8 @@ -from .api import compute_MVBS, compute_MVBS_index_binning, remove_noise +from .api import compute_MVBS, compute_MVBS_index_binning, compute_MVBS_v2, remove_noise __all__ = [ "compute_MVBS", + "compute_MVBS_v2", # TODO: remove this! "compute_MVBS_index_binning", "remove_noise", ] diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index c61b4c1ad..8e195ae19 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -192,6 +192,122 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): return ds_MVBS +def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): + """ + Compute Mean Volume Backscattering Strength (MVBS) + based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. + + Output of this function differs from that of ``compute_MVBS_index_binning``, which computes + bin-averaged Sv according to intervals of ``echo_range`` and ``ping_time`` specified as + index number. 
+ + Parameters + ---------- + ds_Sv : xr.Dataset + dataset containing Sv and ``echo_range`` [m] + range_meter_bin : Union[int, float] + bin size along ``echo_range`` in meters, default to ``20`` + ping_time_bin : str + bin size along ``ping_time``, default to ``20S`` + + Returns + ------- + A dataset containing bin-averaged Sv + """ + + # TODO: remove this as it is no longer necessary + # if not ds_Sv["echo_range"].groupby("channel").apply(_check_range_uniqueness).all(): + # raise ValueError( + # "echo_range variable changes across pings in at least one of the frequency channels." + # ) + + # create bin information for echo_range + range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) + + # create bin information needed for ping_time + # ping_interval = np.array(list(ds_Sv.ping_time.resample(ping_time=ping_time_bin, + # skipna=True).groups.keys())) + + ping_interval = ( + ds_Sv.ping_time.resample(ping_time=ping_time_bin, skipna=True).asfreq().ping_time.values + ) + + # calculate the MVBS along each channel + MVBS_values = get_MVBS_along_channels(ds_Sv, range_interval, ping_interval) + + # create MVBS dataset + ds_MVBS = xr.Dataset( + data_vars={"Sv": (["channel", "ping_time", "echo_range"], MVBS_values)}, + coords={ + "ping_time": ping_interval, + "channel": ds_Sv.channel, + "echo_range": range_interval[:-1], + }, + ) + + # Added this check to support the test in test_process.py::test_compute_MVBS + if "filenames" in ds_MVBS.variables: + ds_MVBS = ds_MVBS.drop_vars("filenames") + + # ping_time_bin parsing and conversions + # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings + # https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.resample.html + # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.html + # 
https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
+    # https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units
+    timedelta_units = {
+        "d": {"nptd64": "D", "unitstr": "day"},
+        "h": {"nptd64": "h", "unitstr": "hour"},
+        "t": {"nptd64": "m", "unitstr": "minute"},
+        "min": {"nptd64": "m", "unitstr": "minute"},
+        "s": {"nptd64": "s", "unitstr": "second"},
+        "l": {"nptd64": "ms", "unitstr": "millisecond"},
+        "ms": {"nptd64": "ms", "unitstr": "millisecond"},
+        "u": {"nptd64": "us", "unitstr": "microsecond"},
+        "us": {"nptd64": "us", "unitstr": "microsecond"},
+        "n": {"nptd64": "ns", "unitstr": "nanosecond"},
+        "ns": {"nptd64": "ns", "unitstr": "nanosecond"},
+    }
+    ping_time_bin_td = pd.Timedelta(ping_time_bin)
+    # res = resolution (most granular time unit)
+    ping_time_bin_resunit = ping_time_bin_td.resolution_string.lower()
+    ping_time_bin_resvalue = int(
+        ping_time_bin_td / np.timedelta64(1, timedelta_units[ping_time_bin_resunit]["nptd64"])
+    )
+    ping_time_bin_resunit_label = timedelta_units[ping_time_bin_resunit]["unitstr"]
+
+    # Attach attributes
+    _set_MVBS_attrs(ds_MVBS)
+    ds_MVBS["echo_range"].attrs = {"long_name": "Range distance", "units": "m"}
+    ds_MVBS["Sv"] = ds_MVBS["Sv"].assign_attrs(
+        {
+            "cell_methods": (
+                f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} "  # noqa
+                "comment: ping_time is the interval start) "
+                f"echo_range: mean (interval: {range_meter_bin} meter "
+                "comment: echo_range is the interval start)"
+            ),
+            "binning_mode": "physical units",
+            "range_meter_interval": str(range_meter_bin) + "m",
+            "ping_time_interval": ping_time_bin,
+            "actual_range": [
+                round(float(ds_MVBS["Sv"].min().values), 2),
+                round(float(ds_MVBS["Sv"].max().values), 2),
+            ],
+        }
+    )
+
+    prov_dict = echopype_prov_attrs(process_type="processing")
+    prov_dict["processing_function"] = 
"preprocess.compute_MVBS" + ds_MVBS = ds_MVBS.assign_attrs(prov_dict) + ds_MVBS["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal + + return ds_MVBS + # return MVBS_values + + def compute_MVBS_index_binning(ds_Sv, range_sample_num=100, ping_num=100): """ Compute Mean Volume Backscattering Strength (MVBS) @@ -454,46 +570,67 @@ def bin_and_mean_2d( """ # determine if array to bin is lazy or not - is_lazy = False - if isinstance(arr, dask.array.Array): - is_lazy = True + # is_lazy = False + # if isinstance(arr, dask.array.Array): + # is_lazy = True # get the number of echo range and time bins n_bin_er = len(bins_er) n_bin_time = len(bins_time) + print(f"n_bin_time = {n_bin_time}") # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - all_means = [] - for bin_time in range(1, n_bin_time + 1): - - # get the indices of time in bin index bin_time - indices_time = np.argwhere(bin_time_ind == bin_time).flatten() - - # select only those array values that are in the time bin being considered - temp_arr = arr[indices_time, :] + binned_means = [] + for bin_er in range(1, n_bin_er): + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - # bin and mean with respect to echo_range bins - if is_lazy: - all_means.append( - dask.delayed(mean_temp_arr)( - n_bin_er, temp_arr, digitized_echo_range[indices_time, :] - ) - ) - else: - all_means.append( - mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) - ) + binned_means.append(er_selected_data) - if is_lazy: - # compute all constructs means - all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? 
+ er_means = np.vstack(binned_means).compute() - # construct final reduced form of arr - final_reduced = np.array(all_means) + final = np.empty((n_bin_time, n_bin_er - 1)) + # for bin_time in range(1, n_bin_time + 1): + for bin_time in range(1, n_bin_time): + indices = np.argwhere(bin_time_ind == bin_time).flatten() - return final_reduced + if len(indices) == 0: + final[bin_time - 1, :] = np.nanmean(er_means[:, :], axis=1) # TODO: look into this + else: + final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) + + return final + + # all_means = [] + # for bin_time in range(1, n_bin_time + 1): + # + # # get the indices of time in bin index bin_time + # indices_time = np.argwhere(bin_time_ind == bin_time).flatten() + # + # # select only those array values that are in the time bin being considered + # temp_arr = arr[indices_time, :] + # + # # bin and mean with respect to echo_range bins + # if is_lazy: + # all_means.append( + # dask.delayed(mean_temp_arr)( + # n_bin_er, temp_arr, digitized_echo_range[indices_time, :] + # ) + # ) + # else: + # all_means.append( + # mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) + # ) + + # if is_lazy: + # # compute all constructs means + # all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? + # + # # construct final reduced form of arr + # final_reduced = np.array(all_means) + # + # return final_reduced def get_MVBS_along_channels( @@ -534,6 +671,14 @@ def get_MVBS_along_channels( # TODO: not sure why not already removed for the AZFP case. Investigate. 
ds = ds_Sv.sel(channel=chan).squeeze() + echo_range = ( + ds["echo_range"] + .dropna(dim="range_sample", how="all") + .dropna(dim="ping_time") + .isel(ping_time=0) + .values + ) + # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) @@ -543,9 +688,13 @@ def get_MVBS_along_channels( bins_time=ping_interval, bins_er=echo_range_interval, times=sv.ping_time.data, - echo_range=ds["echo_range"].data, + echo_range=echo_range, # ds["echo_range"].data, ) + # all_MVBS.append(chan_MVBS) + # all_MVBS.extend(chan_MVBS) + # return all_MVBS + # apply inverse mapping to get back to the original domain and store values all_MVBS.append(10 * np.log10(chan_MVBS)) From cc02dadcc4bc0fb32e709ca3d3ae8470ac904857 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 09:22:15 -0800 Subject: [PATCH 16/28] make the MVBS output values of new method be equivalent to the old method by changing the bins produced by the resample method --- echopype/preprocess/api.py | 117 +++++++++---------------------------- 1 file changed, 26 insertions(+), 91 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 8e195ae19..ec33959e9 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,7 +2,7 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" -from typing import List, Tuple, Union +from typing import Tuple, Union import dask.array import numpy as np @@ -225,9 +225,6 @@ def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) # create bin information needed for ping_time - # ping_interval = np.array(list(ds_Sv.ping_time.resample(ping_time=ping_time_bin, - # skipna=True).groups.keys())) - ping_interval = ( ds_Sv.ping_time.resample(ping_time=ping_time_bin, skipna=True).asfreq().ping_time.values ) @@ -305,7 +302,6 @@ def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): ds_MVBS["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal return ds_MVBS - # return MVBS_values def compute_MVBS_index_binning(ds_Sv, range_sample_num=100, ping_num=100): @@ -494,42 +490,6 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind -def mean_temp_arr( - n_bin_er: int, - temp_arr: Union[dask.array.Array, np.ndarray], - dig_er_subset: Union[dask.array.Array, np.ndarray], -) -> List[Union[dask.array.Array, np.ndarray]]: - """ - Bins the data in ``temp_arr`` with respect to the - ``echo_range`` bin and means the resulting bin. - - Parameters - ---------- - n_bin_er: int - The number of echo range bins - temp_arr: dask.array.Array or np.ndarray - Array of Sv values at the ``ping_time`` bin index being considered - dig_er_subset: dask.array.Array or np.ndarray - Array representing the digitized (bin indices) for ``echo_range`` at - the ``ping_time`` bin index being considered - - Returns - ------- - means: list of dask.array.Array or np.ndarray - - Notes - ----- - It is necessary for this to be a function because we may need to - delay it. 
- """ - - means = [] - for bin_er in range(1, n_bin_er): - means.append(np.nanmean(temp_arr[dig_er_subset == bin_er], axis=0)) - - return means - - def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, @@ -565,73 +525,52 @@ def bin_and_mean_2d( This function assumes that ``arr`` has rows corresponding to ``ping_time`` and columns corresponding to ``echo_range``. - This function allows the number of ``echo_range`` values to + This function should not be run if the number of ``echo_range`` values vary amongst ``ping_times``. """ - # determine if array to bin is lazy or not - # is_lazy = False - # if isinstance(arr, dask.array.Array): - # is_lazy = True - # get the number of echo range and time bins n_bin_er = len(bins_er) n_bin_time = len(bins_time) - print(f"n_bin_time = {n_bin_time}") # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) binned_means = [] for bin_er in range(1, n_bin_er): + + # bin and mean echo_range dimension er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # collect all echo_range bins binned_means.append(er_selected_data) - er_means = np.vstack(binned_means).compute() + # create full echo_range binned array + er_means = np.vstack(binned_means) + + # if er_means is a dask array we compute it so the graph does not get too large + if isinstance(er_means, dask.array.Array): + er_means = er_means.compute() + # create final reduced array final = np.empty((n_bin_time, n_bin_er - 1)) - # for bin_time in range(1, n_bin_time + 1): - for bin_time in range(1, n_bin_time): + + for bin_time in range(1, n_bin_time + 1): + + # obtain er_mean indices corresponding to the time bin indices = np.argwhere(bin_time_ind == bin_time).flatten() if len(indices) == 0: - final[bin_time - 1, :] = np.nanmean(er_means[:, :], axis=1) # TODO: look into this + + # fill values with NaN, if there are no values in the bin + 
final[bin_time - 1, :] = np.nan else: + + # bin and mean the er_mean time bin final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) return final - # all_means = [] - # for bin_time in range(1, n_bin_time + 1): - # - # # get the indices of time in bin index bin_time - # indices_time = np.argwhere(bin_time_ind == bin_time).flatten() - # - # # select only those array values that are in the time bin being considered - # temp_arr = arr[indices_time, :] - # - # # bin and mean with respect to echo_range bins - # if is_lazy: - # all_means.append( - # dask.delayed(mean_temp_arr)( - # n_bin_er, temp_arr, digitized_echo_range[indices_time, :] - # ) - # ) - # else: - # all_means.append( - # mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) - # ) - - # if is_lazy: - # # compute all constructs means - # all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? - # - # # construct final reduced form of arr - # final_reduced = np.array(all_means) - # - # return final_reduced - def get_MVBS_along_channels( ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray @@ -658,10 +597,10 @@ def get_MVBS_along_channels( Notes ----- If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with - respect to ``echo_range`` will take place, then the binning and mean with respect to - ``ping_time`` will be a delayed operation, and lastly the delayed values will be - computed. It is necessary to apply a compute at the end of this method because Dask - graph layers get too large and this makes downstream operations very inefficient. + respect to ``echo_range`` will take place, then the delayed result will be computed, + and lastly the binning and mean with respect to ``ping_time`` will be completed. It + is necessary to apply a compute midway through this method because Dask graph layers + get too large and this makes downstream operations very inefficient. 
""" all_MVBS = [] @@ -688,13 +627,9 @@ def get_MVBS_along_channels( bins_time=ping_interval, bins_er=echo_range_interval, times=sv.ping_time.data, - echo_range=echo_range, # ds["echo_range"].data, + echo_range=echo_range, ) - # all_MVBS.append(chan_MVBS) - # all_MVBS.extend(chan_MVBS) - # return all_MVBS - # apply inverse mapping to get back to the original domain and store values all_MVBS.append(10 * np.log10(chan_MVBS)) From 7dadbf47f091934a276f2adc4a152bda9f95761f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 11:27:24 -0800 Subject: [PATCH 17/28] add code section that groups different echo_range values, computes them, and then constructs the appropriate er_means array --- echopype/preprocess/api.py | 79 ++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index ec33959e9..5f6eab54a 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -490,6 +490,24 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind +def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): + + # TODO: document! + + binned_means = [] + for bin_er in range(1, n_bin_er): + # bin and mean echo_range dimension + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + + # collect all echo_range bins + binned_means.append(er_selected_data) + + # create full echo_range binned array + er_means = np.vstack(binned_means) + + return er_means + + def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, @@ -536,17 +554,52 @@ def bin_and_mean_2d( # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - binned_means = [] - for bin_er in range(1, n_bin_er): + # TODO: clean up the below code! 
- # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() - # collect all echo_range bins - binned_means.append(er_selected_data) + unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) - # create full echo_range binned array - er_means = np.vstack(binned_means) + # TODO: put the below into its own function + if unique_er_bin_ind.shape[0] != 1: + + print("grouping necessary") + print(unique_er_bin_ind.shape[0]) + print(unique_er_bin_ind) + grps_same_ind = [ + np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) + ] + + binned_er = [] # the values appended may not be in the correct order + for count, grp in enumerate(grps_same_ind): + binned_er.append( + bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) + ) + + # put the columns in the correct order + binned_er_array = np.hstack(binned_er) + correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) + er_means = binned_er_array[:, correct_column_ind] + + else: + print("no grouping necessary") + # make sure this correctly works + er_means = bin_and_mean_echo_range(arr, unique_er_bin_ind[0, :], n_bin_er) + + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # + # # bin and mean echo_range dimension + # er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # + # # collect all echo_range bins + # binned_means.append(er_selected_data) + # + # # create full echo_range binned array + # er_means = np.vstack(binned_means) + # # if er_means is a dask array we compute it so the graph does not get too large if isinstance(er_means, dask.array.Array): @@ -610,14 +663,6 @@ def get_MVBS_along_channels( # TODO: not sure why not already 
removed for the AZFP case. Investigate. ds = ds_Sv.sel(channel=chan).squeeze() - echo_range = ( - ds["echo_range"] - .dropna(dim="range_sample", how="all") - .dropna(dim="ping_time") - .isel(ping_time=0) - .values - ) - # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) @@ -627,7 +672,7 @@ def get_MVBS_along_channels( bins_time=ping_interval, bins_er=echo_range_interval, times=sv.ping_time.data, - echo_range=echo_range, + echo_range=ds["echo_range"], ) # apply inverse mapping to get back to the original domain and store values From 42d18a85cc02b84d19869029734566ebb9fb4d5f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 16:31:11 -0800 Subject: [PATCH 18/28] add two functions (less and more comprehensive) that determine if grouping echo_range with respect to ping_time is needed --- echopype/preprocess/api.py | 126 +++++++++++++++---- echopype/tests/preprocess/test_preprocess.py | 2 + 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 5f6eab54a..0cb7befcd 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -508,12 +508,96 @@ def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): return er_means +def get_unequal_rows(mat, row): + # compare row against all rows in mat (allowing for NaNs to be equal) + element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) + + # get those row indices that are not equal to row + row_not_equal = np.argwhere(np.logical_not(np.all(element_nan_equal, axis=1))).flatten() + + return row_not_equal + + +def is_grouping_needed_comprehensive(er_chan): + + if isinstance(er_chan, xr.DataArray): + er_chan = er_chan.values + + # grab the first ping_time that is not filled with NaNs + ping_index = 0 + while np.all(np.isnan(er_chan[ping_index, :])): + ping_index += 1 + + unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) + + if len(unequal_ping_ind) > 0: + + # see if all 
unequal_ping_ind are filled with NaNs + all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) + + if all_nans: + # All echo_range values have the same step size + return False + else: + # Some echo_range values have different step sizes + return True + else: + # All echo_range values have the same step size + return False + + +def is_grouping_needed_less_comprehensive(er_chan): + """ + + + Parameters + ---------- + er_chan + + Notes + ----- + Although this method is slightly faster than ``is_grouping_needed_comprehensive``, + it is possible that this method will incorrectly determine if grouping + is necessary. + """ + + # determine the number of NaNs in each ping and find the unique number of NaNs + unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) + + if isinstance(unique_num_nans, dask.array.Array): + unique_num_nans = unique_num_nans.compute() + + # determine if any value is not 0 or er_chan.shape[1] + unexpected_num_nans = False in np.logical_or( + unique_num_nans == 0, unique_num_nans == er_chan.shape[1] + ) + + if unexpected_num_nans: + # echo_range varies with ping_time + return True + else: + + # make sure that the final echo_range value for each ping_time is the same (account for NaN) + num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() + + if isinstance(num_non_nans, dask.array.Array): + num_non_nans = num_non_nans.compute() + + if num_non_nans > 1: + # echo_range varies with ping_time + return True + else: + # echo_range does not vary with ping_time + return False + + def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, bins_er: np.ndarray, times: np.ndarray, echo_range: np.ndarray, + comp_er_check: bool = True, ) -> np.ndarray: """ Bins and means ``arr`` based on ``times`` and ``echo_range``, @@ -532,6 +616,9 @@ def bin_and_mean_2d( 1D array corresponding to the time values that should be binned echo_range: np.ndarray 2D array of echo range values + 
comp_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done Returns ------- @@ -556,18 +643,23 @@ def bin_and_mean_2d( # TODO: clean up the below code! - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) + if comp_er_check: + grouping_needed = is_grouping_needed_comprehensive(echo_range) + else: + grouping_needed = is_grouping_needed_less_comprehensive(echo_range) - # TODO: put the below into its own function - if unique_er_bin_ind.shape[0] != 1: + # TODO: put the below into its own function + if grouping_needed: print("grouping necessary") - print(unique_er_bin_ind.shape[0]) - print(unique_er_bin_ind) + + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() + + unique_er_bin_ind, unique_inverse = np.unique( + digitized_echo_range, axis=0, return_inverse=True + ) grps_same_ind = [ np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) ] @@ -585,21 +677,7 @@ def bin_and_mean_2d( else: print("no grouping necessary") - # make sure this correctly works - er_means = bin_and_mean_echo_range(arr, unique_er_bin_ind[0, :], n_bin_er) - - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # - # # bin and mean echo_range dimension - # er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - # - # # collect all echo_range bins - # binned_means.append(er_selected_data) - # - # # create full echo_range binned array - # er_means = np.vstack(binned_means) - # + er_means = 
bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) # if er_means is a dask array we compute it so the graph does not get too large if isinstance(er_means, dask.array.Array): diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 7a6701795..87f9c7adf 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -860,6 +860,8 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: the highest possible number of echo range values in a given bin """ + # TODO: add a seed here and print it so we can debug if necessary + # get all parameters needed to create the mock data create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params From e666950adbb56839babcf5c31a61d1a33d655687 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 16:49:52 -0800 Subject: [PATCH 19/28] modify get_unequal_rows to account for dask input and add bool input to bin_and_mean_2d --- echopype/preprocess/api.py | 11 +++++++++-- echopype/tests/preprocess/test_preprocess.py | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 0cb7befcd..b6f018f59 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -512,10 +512,16 @@ def get_unequal_rows(mat, row): # compare row against all rows in mat (allowing for NaNs to be equal) element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) + # determine if mat row is equal to row + row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) + + if isinstance(row_not_equal, dask.array.Array): + row_not_equal = row_not_equal.compute() + # get those row indices that are not equal to row - row_not_equal = np.argwhere(np.logical_not(np.all(element_nan_equal, axis=1))).flatten() + row_ind_not_equal = np.argwhere(row_not_equal).flatten() - return row_not_equal + return row_ind_not_equal 
def is_grouping_needed_comprehensive(er_chan): @@ -751,6 +757,7 @@ def get_MVBS_along_channels( bins_er=echo_range_interval, times=sv.ping_time.data, echo_range=ds["echo_range"], + comp_er_check=True, ) # apply inverse mapping to get back to the original domain and store values diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 87f9c7adf..a4a8b03ee 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -880,7 +880,8 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: # calculate MVBS for mock data set calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, - bins_er=digitize_er_bin, times=final_ping_time, echo_range=final_er) + bins_er=digitize_er_bin, times=final_ping_time, + echo_range=final_er, comp_er_check=True) # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) From cc8770cb402dd5963904b90559ea5b118510a835 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 9 Nov 2022 09:16:10 -0800 Subject: [PATCH 20/28] add seed to test_bin_and_mean_2d and randomly choose a ping bin to be completely NaN --- echopype/preprocess/api.py | 3 - echopype/tests/preprocess/test_preprocess.py | 81 +++++++++++++++----- 2 files changed, 62 insertions(+), 22 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index b6f018f59..20e7fc3c6 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -657,8 +657,6 @@ def bin_and_mean_2d( # TODO: put the below into its own function if grouping_needed: - print("grouping necessary") - # compute bin indices to allow for downstream processes (mainly axis argument in unique) if isinstance(digitized_echo_range, dask.array.Array): digitized_echo_range = digitized_echo_range.compute() @@ -682,7 +680,6 @@ def bin_and_mean_2d( er_means = binned_er_array[:, correct_column_ind] else: - 
print("no grouping necessary") er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) # if er_means is a dask array we compute it so the graph does not get too large diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index a4a8b03ee..40eb058ca 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -4,6 +4,7 @@ import echopype as ep import pytest import dask.array +from numpy.random import default_rng from echopype.preprocess.api import bin_and_mean_2d from typing import Tuple, Iterable, Union @@ -550,7 +551,9 @@ def create_echo_range_related_data(ping_bins: Iterable, num_pings_in_bin: np.ndarray, er_range: list, er_bins: Iterable, final_num_er_bins: int, - create_dask: bool) -> Tuple[list, list, list]: + create_dask: bool, + rng: np.random.Generator, + ping_bin_nan_ind: int) -> Tuple[list, list, list]: """ Creates ``echo_range`` values and associated bin information. 
@@ -570,6 +573,10 @@ def create_echo_range_related_data(ping_bins: Iterable, create_dask: bool If True ``final_arrays`` values will be dask arrays, else they will be numpy arrays + rng: np.random.Generator + The generator for random values + ping_bin_nan_ind: int + The ping bin index to fill with NaNs Returns ------- @@ -591,10 +598,10 @@ def create_echo_range_related_data(ping_bins: Iterable, for ping_ind, ping_bin in enumerate(ping_bins): # create the ping times associated with each ping bin - ping_times_in_bin.append(np.random.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) + ping_times_in_bin.append(rng.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) # randomly determine the number of values in each echo_range bin - num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) + num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) # store the number of values in each echo_range bin all_er_bin_nums.append(num_er_in_bin) @@ -607,12 +614,16 @@ def create_echo_range_related_data(ping_bins: Iterable, a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], num_er_in_bin[count])) else: - a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) + a = rng.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) # store the block of echo_range values er_row_block.append(a) + # set all echo_range values at ping index to NaN + if ping_ind == ping_bin_nan_ind: + a[:, :] = np.nan + # collect and construct echo_range row block final_arrays.append(np.concatenate(er_row_block, axis=1)) @@ -670,8 +681,9 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, all_er_bin_nums: Iterable[np.ndarray], num_pings_in_bin: np.ndarray, - create_dask: bool) -> Tuple[Union[np.ndarray, dask.array.Array], - 
np.ndarray]: + create_dask: bool, + ping_bin_nan_ind: int) -> Tuple[Union[np.ndarray, dask.array.Array], + np.ndarray]: """ Creates the final 2D Sv array with appropriate padding. @@ -689,6 +701,8 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, Specifies the number of pings in each ping time bin create_dask: bool If True ``final_sv`` will be a dask array, else it will be a numpy array + ping_bin_nan_ind: int + The ping bin index to fill with NaNs Returns ------- @@ -712,17 +726,26 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, for count, arr in enumerate(all_er_bin_nums): # create sv row values using natural numbers - sv_row_list = [np.arange(1, num_elem + 1, 1) for num_elem in arr] + sv_row_list = [np.arange(1, num_elem + 1, 1, dtype=np.float64) for num_elem in arr] # create final sv row sv_row = np.concatenate(sv_row_list) # get final mean which is n+1/2 (since we are using natural numbers) - final_means.append([(len(elem) + 1) / 2.0 for elem in sv_row_list]) + ping_mean = [(len(elem) + 1) / 2.0 for elem in sv_row_list] # create sv row block sv_row_block = np.tile(sv_row, (num_pings_in_bin[count], 1)) + if count == ping_bin_nan_ind: + + # fill values with NaNs + ping_mean = [np.nan]*len(sv_row_list) + sv_row_block[:, :] = np.nan + + # store means for ping + final_means.append(ping_mean) + if count == 0: final_sv[0:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block else: @@ -737,8 +760,9 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, def create_known_mean_data(final_num_ping_bins: int, final_num_er_bins: int, ping_range: list, - er_range: list, create_dask: bool) -> Tuple[np.ndarray, np.ndarray, Iterable, - Iterable, np.ndarray, np.ndarray]: + er_range: list, create_dask: bool, + rng: np.random.Generator) -> Tuple[np.ndarray, np.ndarray, Iterable, + Iterable, np.ndarray, np.ndarray]: """ Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays where the MVBS 
is known. @@ -758,6 +782,8 @@ def create_known_mean_data(final_num_ping_bins: int, create_dask: bool If True the ``Sv`` and ``echo_range`` values produced will be dask arrays, else they will be numpy arrays. + rng: np.random.Generator + generator for random integers Returns ------- @@ -776,22 +802,27 @@ def create_known_mean_data(final_num_ping_bins: int, """ # randomly generate the number of pings in each ping bin - num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) + num_pings_in_bin = rng.integers(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) # create bins for ping_time dimension ping_csum = np.cumsum(num_pings_in_bin) ping_bins = create_bins(ping_csum) # create bins for echo_range dimension - num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) + num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) er_csum = np.cumsum(num_er_in_bin) er_bins = create_bins(er_csum) + # randomly select one ping bin to fill with NaNs + ping_bin_nan_ind = rng.choice(len(ping_bins)) + # create the echo_range data and associated bin information all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, er_range, er_bins, final_num_er_bins, - create_dask) + create_dask, + rng, + ping_bin_nan_ind) # create the final echo_range array using created data and padding final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum, create_dask) @@ -801,7 +832,8 @@ def create_known_mean_data(final_num_ping_bins: int, # create the final sv array final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, - all_er_bin_nums, num_pings_in_bin, create_dask) + all_er_bin_nums, num_pings_in_bin, + create_dask, ping_bin_nan_ind) return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time @@ -860,16 +892,27 @@ def 
test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: the highest possible number of echo range values in a given bin """ - # TODO: add a seed here and print it so we can debug if necessary - # get all parameters needed to create the mock data create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params + # randomly generate a seed + seed = np.random.randint(low=10, high=100000, size=1)[0] + + print(f"seed used to generate mock data: {seed}") + + # establish generator for random integers + rng = default_rng(seed=seed) + + # seed dask random generator + if create_dask: + dask.array.random.seed(seed=seed) + # create echo_range, ping_time, and Sv arrays where the MVBS is known known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range, - create_dask) + create_dask, + rng) # put the created ping bins into a form that works with bin_and_mean_2d digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) @@ -884,4 +927,4 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: echo_range=final_er, comp_er_check=True) # compare known MVBS solution against its calculated counterpart - assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) + assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) From 12686adc12cefadff6edb22342bc4614929ff506 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 9 Nov 2022 09:24:11 -0800 Subject: [PATCH 21/28] remove old functions associated with compute_MVBS and use the new compute_MVBS methods only --- echopype/preprocess/__init__.py | 3 +- echopype/preprocess/api.py | 169 ++------------------------------ 2 files changed, 9 insertions(+), 163 deletions(-) diff --git a/echopype/preprocess/__init__.py b/echopype/preprocess/__init__.py index 1f7576bfd..c6f9606d5 100644 --- a/echopype/preprocess/__init__.py +++ b/echopype/preprocess/__init__.py 
@@ -1,8 +1,7 @@ -from .api import compute_MVBS, compute_MVBS_index_binning, compute_MVBS_v2, remove_noise +from .api import compute_MVBS, compute_MVBS_index_binning, remove_noise __all__ = [ "compute_MVBS", - "compute_MVBS_v2", # TODO: remove this! "compute_MVBS_index_binning", "remove_noise", ] diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 20e7fc3c6..58ef91fd7 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -13,63 +13,6 @@ from .noise_est import NoiseEst -def _check_range_uniqueness(da): - """ - Check if range (``echo_range``) changes across ping in a given frequency channel. - """ - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. - da = da.squeeze() - - # remove pings with NaN entries if exist - # since goal here is to check uniqueness - if np.unique(da.isnull(), axis=0).shape[0] != 1: - da = da.dropna(dim="ping_time", how="any") - - # remove padded NaN entries if exist for all pings - da = da.dropna(dim="range_sample", how="all") - - ping_time_idx = np.argwhere([dim == "ping_time" for dim in da.dims])[0][0] - if np.unique(da, axis=ping_time_idx).shape[ping_time_idx] == 1: - return xr.DataArray(data=True, coords={"channel": da["channel"].values}) - else: - return xr.DataArray(data=False, coords={"channel": da["channel"].values}) - - -def _freq_MVBS(ds, rint, pbin): - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. 
- ds = ds.squeeze() - - # average should be done in linear domain - sv = 10 ** (ds["Sv"] / 10) - - # set 1D coordinate using the 1st ping echo_range since identical for all pings - # remove padded NaN entries if exist for all pings - er = ( - ds["echo_range"] - .dropna(dim="range_sample", how="all") - .dropna(dim="ping_time") - .isel(ping_time=0) - ) - - # use first ping er as indexer for sv - sv = sv.sel(range_sample=er.range_sample.values) - sv.coords["echo_range"] = (["range_sample"], er.values) - sv = sv.swap_dims({"range_sample": "echo_range"}) - sv_groupby_bins = ( - sv.resample(ping_time=pbin, skipna=True) - .mean(skipna=True) - .groupby_bins("echo_range", bins=rint, right=False, include_lowest=True) - .mean(skipna=True) - ) - sv_groupby_bins.coords["echo_range"] = (["echo_range_bins"], rint[:-1]) - sv_groupby_bins = sv_groupby_bins.swap_dims({"echo_range_bins": "echo_range"}).drop_vars( - "echo_range_bins" - ) - return 10 * np.log10(sv_groupby_bins) - - def _set_MVBS_attrs(ds): """ Attach common attributes. @@ -118,109 +61,6 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): A dataset containing bin-averaged Sv """ - if not ds_Sv["echo_range"].groupby("channel").apply(_check_range_uniqueness).all(): - raise ValueError( - "echo_range variable changes across pings in at least one of the frequency channels." 
- ) - - # Groupby freq in case of different echo_range (from different sampling intervals) - range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) - ds_MVBS = ( - ds_Sv.groupby("channel") - .apply(_freq_MVBS, args=(range_interval, ping_time_bin)) - .to_dataset() - ) - # Added this check to support the test in test_process.py::test_compute_MVBS - if "filenames" in ds_MVBS.variables: - ds_MVBS = ds_MVBS.drop_vars("filenames") - - # ping_time_bin parsing and conversions - # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings - # https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.resample.html - # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.html - # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html - # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects - # https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units - timedelta_units = { - "d": {"nptd64": "D", "unitstr": "day"}, - "h": {"nptd64": "h", "unitstr": "hour"}, - "t": {"nptd64": "m", "unitstr": "minute"}, - "min": {"nptd64": "m", "unitstr": "minute"}, - "s": {"nptd64": "s", "unitstr": "second"}, - "l": {"nptd64": "ms", "unitstr": "millisecond"}, - "ms": {"nptd64": "ms", "unitstr": "millisecond"}, - "u": {"nptd64": "us", "unitstr": "microsecond"}, - "us": {"nptd64": "ms", "unitstr": "millisecond"}, - "n": {"nptd64": "ns", "unitstr": "nanosecond"}, - "ns": {"nptd64": "ms", "unitstr": "millisecond"}, - } - ping_time_bin_td = pd.Timedelta(ping_time_bin) - # res = resolution (most granular time unit) - ping_time_bin_resunit = ping_time_bin_td.resolution_string.lower() - ping_time_bin_resvalue = int( - ping_time_bin_td / np.timedelta64(1, timedelta_units[ping_time_bin_resunit]["nptd64"]) - ) - ping_time_bin_resunit_label = 
timedelta_units[ping_time_bin_resunit]["unitstr"] - - # Attach attributes - _set_MVBS_attrs(ds_MVBS) - ds_MVBS["echo_range"].attrs = {"long_name": "Range distance", "units": "m"} - ds_MVBS["Sv"] = ds_MVBS["Sv"].assign_attrs( - { - "cell_methods": ( - f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} " # noqa - "comment: ping_time is the interval start) " - f"echo_range: mean (interval: {range_meter_bin} meter " - "comment: echo_range is the interval start)" - ), - "binning_mode": "physical units", - "range_meter_interval": str(range_meter_bin) + "m", - "ping_time_interval": ping_time_bin, - "actual_range": [ - round(float(ds_MVBS["Sv"].min().values), 2), - round(float(ds_MVBS["Sv"].max().values), 2), - ], - } - ) - - prov_dict = echopype_prov_attrs(process_type="processing") - prov_dict["processing_function"] = "preprocess.compute_MVBS" - ds_MVBS = ds_MVBS.assign_attrs(prov_dict) - ds_MVBS["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal - - return ds_MVBS - - -def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): - """ - Compute Mean Volume Backscattering Strength (MVBS) - based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. - - Output of this function differs from that of ``compute_MVBS_index_binning``, which computes - bin-averaged Sv according to intervals of ``echo_range`` and ``ping_time`` specified as - index number. 
- - Parameters - ---------- - ds_Sv : xr.Dataset - dataset containing Sv and ``echo_range`` [m] - range_meter_bin : Union[int, float] - bin size along ``echo_range`` in meters, default to ``20`` - ping_time_bin : str - bin size along ``ping_time``, default to ``20S`` - - Returns - ------- - A dataset containing bin-averaged Sv - """ - - # TODO: remove this as it is no longer necessary - # if not ds_Sv["echo_range"].groupby("channel").apply(_check_range_uniqueness).all(): - # raise ValueError( - # "echo_range variable changes across pings in at least one of the frequency channels." - # ) - # create bin information for echo_range range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) @@ -509,6 +349,9 @@ def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): def get_unequal_rows(mat, row): + + # TODO: document! + # compare row against all rows in mat (allowing for NaNs to be equal) element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) @@ -526,6 +369,8 @@ def get_unequal_rows(mat, row): def is_grouping_needed_comprehensive(er_chan): + # TODO: document! + if isinstance(er_chan, xr.DataArray): er_chan = er_chan.values @@ -567,6 +412,8 @@ def is_grouping_needed_less_comprehensive(er_chan): is necessary. """ + # TODO: document! 
+ # determine the number of NaNs in each ping and find the unique number of NaNs unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) @@ -654,7 +501,7 @@ def bin_and_mean_2d( else: grouping_needed = is_grouping_needed_less_comprehensive(echo_range) - # TODO: put the below into its own function + # TODO: put the below into its own function if grouping_needed: # compute bin indices to allow for downstream processes (mainly axis argument in unique) From 646f45c18db3267daff778ab8fab0a3db7753ad4 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 10 Nov 2022 15:55:36 -0800 Subject: [PATCH 22/28] comment and add docstrings to previously undocumented functions required for compute_MVBS --- echopype/preprocess/api.py | 96 ++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 13 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 58ef91fd7..1cb19ad5a 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -330,12 +330,30 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind -def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): +def bin_and_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int +) -> Union[np.ndarray, dask.array.Array]: + """ + Bins and means ``arr`` with respect to the ``echo_range`` bins. - # TODO: document! 
+ Parameters + ---------- + arr: np.ndarray or dask.array.Array + 2D array to bin and mean + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: np.ndarray or dask.array.Array + 2D array representing the bin and mean of ``arr`` along ``echo_range`` + """ binned_means = [] for bin_er in range(1, n_bin_er): + # bin and mean echo_range dimension er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) @@ -348,9 +366,29 @@ def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): return er_means -def get_unequal_rows(mat, row): +def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: + """ + Obtains those row indices of ``mat`` that are not equal + to ``row``. + + Parameters + ---------- + mat: np.ndarray + 2D array with the same column dimension as the number + of elements in ``row`` + row: np.ndarray + 1D array with the same number of element elements as + the column dimension of ``mat`` - # TODO: document! + Returns + ------- + row_ind_not_equal: np.ndarray + The row indices of ``mat`` that are not equal to ``row`` + + Notes + ----- + Elements with NaNs are considered equal if they are in the same position. + """ # compare row against all rows in mat (allowing for NaNs to be equal) element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) @@ -367,10 +405,30 @@ def get_unequal_rows(mat, row): return row_ind_not_equal -def is_grouping_needed_comprehensive(er_chan): +def is_grouping_needed_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: + """ + A comprehensive check that determines if all ``echo_range`` values + along ``ping_time`` have the same step size. If they do not have + the same step sizes, then grouping of the ``echo_range`` values + will be necessary. 
+ + Parameters + ---------- + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - # TODO: document! + Notes + ----- + ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ + # grab the in-memory numpy echo_range values, if necessary if isinstance(er_chan, xr.DataArray): er_chan = er_chan.values @@ -379,6 +437,7 @@ def is_grouping_needed_comprehensive(er_chan): while np.all(np.isnan(er_chan[ping_index, :])): ping_index += 1 + # determine those rows of er_chan that are not equal to the row ping_index unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) if len(unequal_ping_ind) > 0: @@ -397,26 +456,36 @@ def is_grouping_needed_comprehensive(er_chan): return False -def is_grouping_needed_less_comprehensive(er_chan): +def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: """ - + An alternative (less comprehensive) check that determines if all + ``echo_range`` values along ``ping_time`` have the same step size. + If they do not have the same step sizes, then grouping of the + ``echo_range`` values will be necessary. Parameters ---------- - er_chan + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False Notes ----- - Although this method is slightly faster than ``is_grouping_needed_comprehensive``, - it is possible that this method will incorrectly determine if grouping + It is possible that this method will incorrectly determine if grouping is necessary. - """ - # TODO: document! 
+ ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ # determine the number of NaNs in each ping and find the unique number of NaNs unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) + # compute the results, if necessary, to allow for downstream checks if isinstance(unique_num_nans, dask.array.Array): unique_num_nans = unique_num_nans.compute() @@ -433,6 +502,7 @@ def is_grouping_needed_less_comprehensive(er_chan): # make sure that the final echo_range value for each ping_time is the same (account for NaN) num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() + # compute the results, if necessary, to allow for downstream checks if isinstance(num_non_nans, dask.array.Array): num_non_nans = num_non_nans.compute() From ff095cdd2a6df2519203f5a2f788fb4ea8a0316e Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 10 Nov 2022 16:30:37 -0800 Subject: [PATCH 23/28] clean up bin_and_mean_2d code by creating a function for grouping, binning, and calculating the mean of an array with respect to echo_range --- echopype/preprocess/api.py | 89 +++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 1cb19ad5a..e65f53a28 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -514,6 +514,60 @@ def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarra return False +def group_bin_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], + digitized_echo_range: Union[np.ndarray, dask.array.Array], + n_bin_er: int, +) -> Union[np.ndarray, dask.array.Array]: + """ + Groups the rows of ``arr`` such that they have the same corresponding + row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` + on each group, and lastly assembles the correctly ordered ``er_means`` array + representing the bin and mean of ``arr`` 
with respect to ``echo_range``. + + Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + digitized_echo_range: dask.array.Array or np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: dask.array.Array or np.ndarray + The bin and mean of ``arr`` with respect to ``echo_range`` + """ + + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() + + # determine the unique rows of digitized_echo_range and the inverse + unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) + + # create groups of row indices using the unique inverse + grps_same_ind = [ + np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) + ] + + # for each group bin and mean arr along echo_range + # note: the values appended may not be in the correct final order + binned_er = [] + for count, grp in enumerate(grps_same_ind): + binned_er.append( + bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) + ) + + # construct er_means and put the columns in the correct order + binned_er_array = np.hstack(binned_er) + correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) + er_means = binned_er_array[:, correct_column_ind] + + return er_means + + def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, @@ -564,59 +618,34 @@ def bin_and_mean_2d( # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - # TODO: clean up the below code! 
- + # determine if grouping of echo_range values with the same step size is necessary if comp_er_check: grouping_needed = is_grouping_needed_comprehensive(echo_range) else: grouping_needed = is_grouping_needed_less_comprehensive(echo_range) - # TODO: put the below into its own function if grouping_needed: - - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - unique_er_bin_ind, unique_inverse = np.unique( - digitized_echo_range, axis=0, return_inverse=True - ) - grps_same_ind = [ - np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) - ] - - binned_er = [] # the values appended may not be in the correct order - for count, grp in enumerate(grps_same_ind): - binned_er.append( - bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) - ) - - # put the columns in the correct order - binned_er_array = np.hstack(binned_er) - correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) - er_means = binned_er_array[:, correct_column_ind] - + # groups, bins, and means arr with respect to echo_range + er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) else: + # bin and mean arr with respect to echo_range er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) # if er_means is a dask array we compute it so the graph does not get too large if isinstance(er_means, dask.array.Array): er_means = er_means.compute() - # create final reduced array + # create final reduced array i.e. 
MVBS final = np.empty((n_bin_time, n_bin_er - 1)) - for bin_time in range(1, n_bin_time + 1): # obtain er_mean indices corresponding to the time bin indices = np.argwhere(bin_time_ind == bin_time).flatten() if len(indices) == 0: - # fill values with NaN, if there are no values in the bin final[bin_time - 1, :] = np.nan else: - # bin and mean the er_mean time bin final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) From 82687481fbb812777eb4420b14ba31e46f5a729b Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 10:50:41 -0800 Subject: [PATCH 24/28] catch warnings associated with taking the nanmean of an array filled with NaNs --- echopype/preprocess/api.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index e65f53a28..29024ca8f 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,9 +2,11 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" +import warnings from typing import Tuple, Union import dask.array +import dask.distributed import numpy as np import pandas as pd import xarray as xr @@ -354,8 +356,14 @@ def bin_and_mean_echo_range( binned_means = [] for bin_er in range(1, n_bin_er): - # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # Catch a known warning that can occur, which does not impact the results + with warnings.catch_warnings(): + + # ignore warnings caused by taking a mean of an array filled with NaNs + warnings.filterwarnings(action="ignore", message="Mean of empty slice") + + # bin and mean echo_range dimension + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) # collect all echo_range bins binned_means.append(er_selected_data) From 7cadba27e08e66972f49ad32408bb8c2b969f367 Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Wed, 23 Nov 2022 10:57:49 -0800 Subject: [PATCH 25/28] provide more specific docstring for arr in bin_and_mean_echo_range Co-authored-by: Wu-Jung Lee --- echopype/preprocess/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 29024ca8f..bc258bc80 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -341,7 +341,8 @@ def bin_and_mean_echo_range( Parameters ---------- arr: np.ndarray or dask.array.Array - 2D array to bin and mean + 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` + and compute mean of each bin digitized_echo_range: np.ndarray 2D array of bin indices for ``echo_range`` n_bin_er: int From 32093d11f9b0c50019eb380d77dabd9f2ce8a1f2 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 11:37:40 -0800 Subject: [PATCH 26/28] refactor and rename code associated with determining if grouping of echo_range is needed --- echopype/preprocess/api.py | 45 +++++++++++++++----- 
echopype/tests/preprocess/test_preprocess.py | 2 +- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index bc258bc80..ba1f80b5b 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -414,7 +414,7 @@ def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: return row_ind_not_equal -def is_grouping_needed_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: +def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: """ A comprehensive check that determines if all ``echo_range`` values along ``ping_time`` have the same step size. If they do not have @@ -465,7 +465,7 @@ def is_grouping_needed_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) - return False -def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: +def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: """ An alternative (less comprehensive) check that determines if all ``echo_range`` values along ``ping_time`` have the same step size. @@ -523,6 +523,34 @@ def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarra return False +def is_er_grouping_needed( + echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool +) -> bool: + """ + Determines if ``echo_range`` values along ``ping_time`` can change and + thus need to be grouped. 
+ + Parameters + ---------- + echo_range: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + comprehensive_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done + + Returns + ------- + bool + If True grouping of ``echo_range`` will be required, else it will not + be necessary + """ + + if comprehensive_er_check: + return if_all_er_steps_identical(echo_range) + else: + return if_last_er_steps_identical(echo_range) + + def group_bin_mean_echo_range( arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: Union[np.ndarray, dask.array.Array], @@ -583,7 +611,7 @@ def bin_and_mean_2d( bins_er: np.ndarray, times: np.ndarray, echo_range: np.ndarray, - comp_er_check: bool = True, + comprehensive_er_check: bool = True, ) -> np.ndarray: """ Bins and means ``arr`` based on ``times`` and ``echo_range``, @@ -602,7 +630,7 @@ def bin_and_mean_2d( 1D array corresponding to the time values that should be binned echo_range: np.ndarray 2D array of echo range values - comp_er_check: bool + comprehensive_er_check: bool If True, a more comprehensive check will be completed to determine if ``echo_range`` grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done @@ -628,12 +656,9 @@ def bin_and_mean_2d( digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) # determine if grouping of echo_range values with the same step size is necessary - if comp_er_check: - grouping_needed = is_grouping_needed_comprehensive(echo_range) - else: - grouping_needed = is_grouping_needed_less_comprehensive(echo_range) + er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) - if grouping_needed: + if er_grouping_needed: # groups, bins, and means arr with respect to echo_range er_means = group_bin_mean_echo_range(arr, 
digitized_echo_range, n_bin_er) else: @@ -709,7 +734,7 @@ def get_MVBS_along_channels( bins_er=echo_range_interval, times=sv.ping_time.data, echo_range=ds["echo_range"], - comp_er_check=True, + comprehensive_er_check=True, ) # apply inverse mapping to get back to the original domain and store values diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 40eb058ca..e9ae00f35 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -924,7 +924,7 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: # calculate MVBS for mock data set calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, bins_er=digitize_er_bin, times=final_ping_time, - echo_range=final_er, comp_er_check=True) + echo_range=final_er, comprehensive_er_check=True) # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) From 94d640de4cd81d0d9959d8ef8017db7bddbae718 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 11:55:37 -0800 Subject: [PATCH 27/28] move all core routines needed for compute_MVBS into the python script mvbs.py --- echopype/preprocess/api.py | 462 +----------------- echopype/preprocess/mvbs.py | 466 +++++++++++++++++++ echopype/tests/preprocess/test_preprocess.py | 2 +- 3 files changed, 468 insertions(+), 462 deletions(-) create mode 100644 echopype/preprocess/mvbs.py diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index ba1f80b5b..27a0453fc 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,16 +2,12 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" -import warnings -from typing import Tuple, Union - -import dask.array -import dask.distributed import numpy as np import pandas as pd import xarray as xr from ..utils.prov import echopype_prov_attrs +from .mvbs import get_MVBS_along_channels from .noise_est import NoiseEst @@ -286,459 +282,3 @@ def remove_noise(ds_Sv, ping_num, range_sample_num, noise_max=None, SNR_threshol def regrid(): return 1 - - -def get_bin_indices( - echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray -) -> Tuple[np.ndarray, np.ndarray]: - """ - Obtains the bin index of ``echo_range`` and ``times`` based - on the binning ``bins_er`` and ``bins_time``, respectively. - - Parameters - ---------- - echo_range: np.ndarray - 2D array of echo range values - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - - Returns - ------- - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - bin_time_ind: np.ndarray - 1D array of bin indices for ``times`` - """ - - # get bin index for each echo range value - digitized_echo_range = np.digitize(echo_range, bins_er, right=False) - - # turn datetime into integers, so we can use np.digitize - if isinstance(times, dask.array.Array): - times_i8 = times.compute().data.view("i8") - else: - times_i8 = times.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_time_i8 = bins_time.view("i8") - - # get bin index for each time - bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) - - return digitized_echo_range, bin_time_ind - - -def bin_and_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int -) -> Union[np.ndarray, dask.array.Array]: - """ - Bins and means ``arr`` with 
respect to the ``echo_range`` bins. - - Parameters - ---------- - arr: np.ndarray or dask.array.Array - 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` - and compute mean of each bin - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: np.ndarray or dask.array.Array - 2D array representing the bin and mean of ``arr`` along ``echo_range`` - """ - - binned_means = [] - for bin_er in range(1, n_bin_er): - - # Catch a known warning that can occur, which does not impact the results - with warnings.catch_warnings(): - - # ignore warnings caused by taking a mean of an array filled with NaNs - warnings.filterwarnings(action="ignore", message="Mean of empty slice") - - # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - - # collect all echo_range bins - binned_means.append(er_selected_data) - - # create full echo_range binned array - er_means = np.vstack(binned_means) - - return er_means - - -def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: - """ - Obtains those row indices of ``mat`` that are not equal - to ``row``. - - Parameters - ---------- - mat: np.ndarray - 2D array with the same column dimension as the number - of elements in ``row`` - row: np.ndarray - 1D array with the same number of element elements as - the column dimension of ``mat`` - - Returns - ------- - row_ind_not_equal: np.ndarray - The row indices of ``mat`` that are not equal to ``row`` - - Notes - ----- - Elements with NaNs are considered equal if they are in the same position. 
- """ - - # compare row against all rows in mat (allowing for NaNs to be equal) - element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) - - # determine if mat row is equal to row - row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) - - if isinstance(row_not_equal, dask.array.Array): - row_not_equal = row_not_equal.compute() - - # get those row indices that are not equal to row - row_ind_not_equal = np.argwhere(row_not_equal).flatten() - - return row_ind_not_equal - - -def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - A comprehensive check that determines if all ``echo_range`` values - along ``ping_time`` have the same step size. If they do not have - the same step sizes, then grouping of the ``echo_range`` values - will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # grab the in-memory numpy echo_range values, if necessary - if isinstance(er_chan, xr.DataArray): - er_chan = er_chan.values - - # grab the first ping_time that is not filled with NaNs - ping_index = 0 - while np.all(np.isnan(er_chan[ping_index, :])): - ping_index += 1 - - # determine those rows of er_chan that are not equal to the row ping_index - unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) - - if len(unequal_ping_ind) > 0: - - # see if all unequal_ping_ind are filled with NaNs - all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) - - if all_nans: - # All echo_range values have the same step size - return False - else: - # Some echo_range values have different step sizes - return True - else: - # All echo_range values have the same step 
size - return False - - -def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - An alternative (less comprehensive) check that determines if all - ``echo_range`` values along ``ping_time`` have the same step size. - If they do not have the same step sizes, then grouping of the - ``echo_range`` values will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - It is possible that this method will incorrectly determine if grouping - is necessary. - - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # determine the number of NaNs in each ping and find the unique number of NaNs - unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) - - # compute the results, if necessary, to allow for downstream checks - if isinstance(unique_num_nans, dask.array.Array): - unique_num_nans = unique_num_nans.compute() - - # determine if any value is not 0 or er_chan.shape[1] - unexpected_num_nans = False in np.logical_or( - unique_num_nans == 0, unique_num_nans == er_chan.shape[1] - ) - - if unexpected_num_nans: - # echo_range varies with ping_time - return True - else: - - # make sure that the final echo_range value for each ping_time is the same (account for NaN) - num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() - - # compute the results, if necessary, to allow for downstream checks - if isinstance(num_non_nans, dask.array.Array): - num_non_nans = num_non_nans.compute() - - if num_non_nans > 1: - # echo_range varies with ping_time - return True - else: - # echo_range does not vary with ping_time - return False - - -def is_er_grouping_needed( - echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool 
-) -> bool: - """ - Determines if ``echo_range`` values along ``ping_time`` can change and - thus need to be grouped. - - Parameters - ---------- - echo_range: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - bool - If True grouping of ``echo_range`` will be required, else it will not - be necessary - """ - - if comprehensive_er_check: - return if_all_er_steps_identical(echo_range) - else: - return if_last_er_steps_identical(echo_range) - - -def group_bin_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], - digitized_echo_range: Union[np.ndarray, dask.array.Array], - n_bin_er: int, -) -> Union[np.ndarray, dask.array.Array]: - """ - Groups the rows of ``arr`` such that they have the same corresponding - row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` - on each group, and lastly assembles the correctly ordered ``er_means`` array - representing the bin and mean of ``arr`` with respect to ``echo_range``. 
- - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - digitized_echo_range: dask.array.Array or np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: dask.array.Array or np.ndarray - The bin and mean of ``arr`` with respect to ``echo_range`` - """ - - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - # determine the unique rows of digitized_echo_range and the inverse - unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) - - # create groups of row indices using the unique inverse - grps_same_ind = [ - np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) - ] - - # for each group bin and mean arr along echo_range - # note: the values appended may not be in the correct final order - binned_er = [] - for count, grp in enumerate(grps_same_ind): - binned_er.append( - bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) - ) - - # construct er_means and put the columns in the correct order - binned_er_array = np.hstack(binned_er) - correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) - er_means = binned_er_array[:, correct_column_ind] - - return er_means - - -def bin_and_mean_2d( - arr: Union[dask.array.Array, np.ndarray], - bins_time: np.ndarray, - bins_er: np.ndarray, - times: np.ndarray, - echo_range: np.ndarray, - comprehensive_er_check: bool = True, -) -> np.ndarray: - """ - Bins and means ``arr`` based on ``times`` and ``echo_range``, - and their corresponding bins. If ``arr`` is ``Sv`` then this - will compute the MVBS. 
- - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - echo_range: np.ndarray - 2D array of echo range values - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - final_reduced: np.ndarray - The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS - - Notes - ----- - This function assumes that ``arr`` has rows corresponding to - ``ping_time`` and columns corresponding to ``echo_range``. - - This function should not be run if the number of ``echo_range`` values - vary amongst ``ping_times``. - """ - - # get the number of echo range and time bins - n_bin_er = len(bins_er) - n_bin_time = len(bins_time) - - # obtain the bin indices for echo_range and times - digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - - # determine if grouping of echo_range values with the same step size is necessary - er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) - - if er_grouping_needed: - # groups, bins, and means arr with respect to echo_range - er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) - else: - # bin and mean arr with respect to echo_range - er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) - - # if er_means is a dask array we compute it so the graph does not get too large - if isinstance(er_means, dask.array.Array): - er_means = er_means.compute() - - # create final reduced array i.e. 
MVBS - final = np.empty((n_bin_time, n_bin_er - 1)) - for bin_time in range(1, n_bin_time + 1): - - # obtain er_mean indices corresponding to the time bin - indices = np.argwhere(bin_time_ind == bin_time).flatten() - - if len(indices) == 0: - # fill values with NaN, if there are no values in the bin - final[bin_time - 1, :] = np.nan - else: - # bin and mean the er_mean time bin - final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) - - return final - - -def get_MVBS_along_channels( - ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray -) -> np.ndarray: - """ - Computes the MVBS of ``ds_Sv`` along each channel for the given - intervals. - - Parameters - ---------- - ds_Sv: xr.Dataset - A Dataset containing ``Sv`` and ``echo_range`` data with coordinates - ``channel``, ``ping_time``, and ``range_sample`` - echo_range_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - ping_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``ping_time`` - - Returns - ------- - np.ndarray - The MVBS value of the input ``ds_Sv`` for all channels - - Notes - ----- - If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with - respect to ``echo_range`` will take place, then the delayed result will be computed, - and lastly the binning and mean with respect to ``ping_time`` will be completed. It - is necessary to apply a compute midway through this method because Dask graph layers - get too large and this makes downstream operations very inefficient. - """ - - all_MVBS = [] - for chan in ds_Sv.channel: - - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. 
- ds = ds_Sv.sel(channel=chan).squeeze() - - # average should be done in linear domain - sv = 10 ** (ds["Sv"] / 10) - - # get MVBS for channel in linear domain - chan_MVBS = bin_and_mean_2d( - sv.data, - bins_time=ping_interval, - bins_er=echo_range_interval, - times=sv.ping_time.data, - echo_range=ds["echo_range"], - comprehensive_er_check=True, - ) - - # apply inverse mapping to get back to the original domain and store values - all_MVBS.append(10 * np.log10(chan_MVBS)) - - # collect the MVBS values for each channel - return np.stack(all_MVBS, axis=0) diff --git a/echopype/preprocess/mvbs.py b/echopype/preprocess/mvbs.py new file mode 100644 index 000000000..a6d548fe5 --- /dev/null +++ b/echopype/preprocess/mvbs.py @@ -0,0 +1,466 @@ +""" +Contains core functions needed to compute the MVBS of an input dataset. +""" + +import warnings +from typing import Tuple, Union + +import dask.array +import numpy as np +import xarray as xr + + +def get_bin_indices( + echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: + """ + Obtains the bin index of ``echo_range`` and ``times`` based + on the binning ``bins_er`` and ``bins_time``, respectively. 
+ + Parameters + ---------- + echo_range: np.ndarray + 2D array of echo range values + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + + Returns + ------- + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + bin_time_ind: np.ndarray + 1D array of bin indices for ``times`` + """ + + # get bin index for each echo range value + digitized_echo_range = np.digitize(echo_range, bins_er, right=False) + + # turn datetime into integers, so we can use np.digitize + if isinstance(times, dask.array.Array): + times_i8 = times.compute().data.view("i8") + else: + times_i8 = times.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_time_i8 = bins_time.view("i8") + + # get bin index for each time + bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) + + return digitized_echo_range, bin_time_ind + + +def bin_and_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int +) -> Union[np.ndarray, dask.array.Array]: + """ + Bins and means ``arr`` with respect to the ``echo_range`` bins. 
+ + Parameters + ---------- + arr: np.ndarray or dask.array.Array + 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` + and compute mean of each bin + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: np.ndarray or dask.array.Array + 2D array representing the bin and mean of ``arr`` along ``echo_range`` + """ + + binned_means = [] + for bin_er in range(1, n_bin_er): + + # Catch a known warning that can occur, which does not impact the results + with warnings.catch_warnings(): + + # ignore warnings caused by taking a mean of an array filled with NaNs + warnings.filterwarnings(action="ignore", message="Mean of empty slice") + + # bin and mean echo_range dimension + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + + # collect all echo_range bins + binned_means.append(er_selected_data) + + # create full echo_range binned array + er_means = np.vstack(binned_means) + + return er_means + + +def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: + """ + Obtains those row indices of ``mat`` that are not equal + to ``row``. + + Parameters + ---------- + mat: np.ndarray + 2D array with the same column dimension as the number + of elements in ``row`` + row: np.ndarray + 1D array with the same number of element elements as + the column dimension of ``mat`` + + Returns + ------- + row_ind_not_equal: np.ndarray + The row indices of ``mat`` that are not equal to ``row`` + + Notes + ----- + Elements with NaNs are considered equal if they are in the same position. 
+ """ + + # compare row against all rows in mat (allowing for NaNs to be equal) + element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) + + # determine if mat row is equal to row + row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) + + if isinstance(row_not_equal, dask.array.Array): + row_not_equal = row_not_equal.compute() + + # get those row indices that are not equal to row + row_ind_not_equal = np.argwhere(row_not_equal).flatten() + + return row_ind_not_equal + + +def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: + """ + A comprehensive check that determines if all ``echo_range`` values + along ``ping_time`` have the same step size. If they do not have + the same step sizes, then grouping of the ``echo_range`` values + will be necessary. + + Parameters + ---------- + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False + + Notes + ----- + ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ + + # grab the in-memory numpy echo_range values, if necessary + if isinstance(er_chan, xr.DataArray): + er_chan = er_chan.values + + # grab the first ping_time that is not filled with NaNs + ping_index = 0 + while np.all(np.isnan(er_chan[ping_index, :])): + ping_index += 1 + + # determine those rows of er_chan that are not equal to the row ping_index + unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) + + if len(unequal_ping_ind) > 0: + + # see if all unequal_ping_ind are filled with NaNs + all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) + + if all_nans: + # All echo_range values have the same step size + return False + else: + # Some echo_range values have different step sizes + return True + else: + # All echo_range values have the same step 
size + return False + + +def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: + """ + An alternative (less comprehensive) check that determines if all + ``echo_range`` values along ``ping_time`` have the same step size. + If they do not have the same step sizes, then grouping of the + ``echo_range`` values will be necessary. + + Parameters + ---------- + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False + + Notes + ----- + It is possible that this method will incorrectly determine if grouping + is necessary. + + ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ + + # determine the number of NaNs in each ping and find the unique number of NaNs + unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) + + # compute the results, if necessary, to allow for downstream checks + if isinstance(unique_num_nans, dask.array.Array): + unique_num_nans = unique_num_nans.compute() + + # determine if any value is not 0 or er_chan.shape[1] + unexpected_num_nans = False in np.logical_or( + unique_num_nans == 0, unique_num_nans == er_chan.shape[1] + ) + + if unexpected_num_nans: + # echo_range varies with ping_time + return True + else: + + # make sure that the final echo_range value for each ping_time is the same (account for NaN) + num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() + + # compute the results, if necessary, to allow for downstream checks + if isinstance(num_non_nans, dask.array.Array): + num_non_nans = num_non_nans.compute() + + if num_non_nans > 1: + # echo_range varies with ping_time + return True + else: + # echo_range does not vary with ping_time + return False + + +def is_er_grouping_needed( + echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool 
+) -> bool: + """ + Determines if ``echo_range`` values along ``ping_time`` can change and + thus need to be grouped. + + Parameters + ---------- + echo_range: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + comprehensive_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done + + Returns + ------- + bool + If True grouping of ``echo_range`` will be required, else it will not + be necessary + """ + + if comprehensive_er_check: + return if_all_er_steps_identical(echo_range) + else: + return if_last_er_steps_identical(echo_range) + + +def group_bin_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], + digitized_echo_range: Union[np.ndarray, dask.array.Array], + n_bin_er: int, +) -> Union[np.ndarray, dask.array.Array]: + """ + Groups the rows of ``arr`` such that they have the same corresponding + row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` + on each group, and lastly assembles the correctly ordered ``er_means`` array + representing the bin and mean of ``arr`` with respect to ``echo_range``. 
+ + Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + digitized_echo_range: dask.array.Array or np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: dask.array.Array or np.ndarray + The bin and mean of ``arr`` with respect to ``echo_range`` + """ + + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() + + # determine the unique rows of digitized_echo_range and the inverse + unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) + + # create groups of row indices using the unique inverse + grps_same_ind = [ + np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) + ] + + # for each group bin and mean arr along echo_range + # note: the values appended may not be in the correct final order + binned_er = [] + for count, grp in enumerate(grps_same_ind): + binned_er.append( + bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) + ) + + # construct er_means and put the columns in the correct order + binned_er_array = np.hstack(binned_er) + correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) + er_means = binned_er_array[:, correct_column_ind] + + return er_means + + +def bin_and_mean_2d( + arr: Union[dask.array.Array, np.ndarray], + bins_time: np.ndarray, + bins_er: np.ndarray, + times: np.ndarray, + echo_range: np.ndarray, + comprehensive_er_check: bool = True, +) -> np.ndarray: + """ + Bins and means ``arr`` based on ``times`` and ``echo_range``, + and their corresponding bins. If ``arr`` is ``Sv`` then this + will compute the MVBS. 
+ + Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + echo_range: np.ndarray + 2D array of echo range values + comprehensive_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done + + Returns + ------- + final_reduced: np.ndarray + The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS + + Notes + ----- + This function assumes that ``arr`` has rows corresponding to + ``ping_time`` and columns corresponding to ``echo_range``. + + This function should not be run if the number of ``echo_range`` values + vary amongst ``ping_times``. + """ + + # get the number of echo range and time bins + n_bin_er = len(bins_er) + n_bin_time = len(bins_time) + + # obtain the bin indices for echo_range and times + digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) + + # determine if grouping of echo_range values with the same step size is necessary + er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) + + if er_grouping_needed: + # groups, bins, and means arr with respect to echo_range + er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) + else: + # bin and mean arr with respect to echo_range + er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) + + # if er_means is a dask array we compute it so the graph does not get too large + if isinstance(er_means, dask.array.Array): + er_means = er_means.compute() + + # create final reduced array i.e. 
MVBS + final = np.empty((n_bin_time, n_bin_er - 1)) + for bin_time in range(1, n_bin_time + 1): + + # obtain er_mean indices corresponding to the time bin + indices = np.argwhere(bin_time_ind == bin_time).flatten() + + if len(indices) == 0: + # fill values with NaN, if there are no values in the bin + final[bin_time - 1, :] = np.nan + else: + # bin and mean the er_mean time bin + final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) + + return final + + +def get_MVBS_along_channels( + ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray +) -> np.ndarray: + """ + Computes the MVBS of ``ds_Sv`` along each channel for the given + intervals. + + Parameters + ---------- + ds_Sv: xr.Dataset + A Dataset containing ``Sv`` and ``echo_range`` data with coordinates + ``channel``, ``ping_time``, and ``range_sample`` + echo_range_interval: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + ping_interval: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``ping_time`` + + Returns + ------- + np.ndarray + The MVBS value of the input ``ds_Sv`` for all channels + + Notes + ----- + If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with + respect to ``echo_range`` will take place, then the delayed result will be computed, + and lastly the binning and mean with respect to ``ping_time`` will be completed. It + is necessary to apply a compute midway through this method because Dask graph layers + get too large and this makes downstream operations very inefficient. + """ + + all_MVBS = [] + for chan in ds_Sv.channel: + + # squeeze to remove "channel" dim if present + # TODO: not sure why not already removed for the AZFP case. Investigate. 
+ ds = ds_Sv.sel(channel=chan).squeeze() + + # average should be done in linear domain + sv = 10 ** (ds["Sv"] / 10) + + # get MVBS for channel in linear domain + chan_MVBS = bin_and_mean_2d( + sv.data, + bins_time=ping_interval, + bins_er=echo_range_interval, + times=sv.ping_time.data, + echo_range=ds["echo_range"], + comprehensive_er_check=True, + ) + + # apply inverse mapping to get back to the original domain and store values + all_MVBS.append(10 * np.log10(chan_MVBS)) + + # collect the MVBS values for each channel + return np.stack(all_MVBS, axis=0) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index e9ae00f35..4e96986b3 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,7 +6,7 @@ import dask.array from numpy.random import default_rng -from echopype.preprocess.api import bin_and_mean_2d +from echopype.preprocess.mvbs import bin_and_mean_2d from typing import Tuple, Iterable, Union From 7b8fb36ff225092a491e07b361131663172fbf90 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 13:24:28 -0800 Subject: [PATCH 28/28] add note to docstring of bin_and_mean_2d and rename group_bin_mean_echo_range to group_dig_er_bin_mean_echo_range --- echopype/preprocess/mvbs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/echopype/preprocess/mvbs.py b/echopype/preprocess/mvbs.py index a6d548fe5..a08ebf8a9 100644 --- a/echopype/preprocess/mvbs.py +++ b/echopype/preprocess/mvbs.py @@ -273,7 +273,7 @@ def is_er_grouping_needed( return if_last_er_steps_identical(echo_range) -def group_bin_mean_echo_range( +def group_dig_er_bin_mean_echo_range( arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: Union[np.ndarray, dask.array.Array], n_bin_er: int, @@ -367,7 +367,8 @@ def bin_and_mean_2d( ``ping_time`` and columns corresponding to ``echo_range``. 
This function should not be run if the number of ``echo_range`` values - vary amongst ``ping_times``. + vary amongst ``ping_times``. This should not occur for our current use + of echopype-generated Sv data. """ # get the number of echo range and time bins @@ -382,7 +383,7 @@ def bin_and_mean_2d( if er_grouping_needed: # groups, bins, and means arr with respect to echo_range - er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) + er_means = group_dig_er_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) else: # bin and mean arr with respect to echo_range er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er)