From 921a4d9b86071fcbe566493e2f2c47773fe68775 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 10 Oct 2022 17:07:31 -0700 Subject: [PATCH 01/28] establish core function for binning and computing the mean of a 2D array --- echopype/preprocess/api.py | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 5b6e4be02..a0b05ec2c 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import xarray as xr +import dask.array from ..utils.prov import echopype_prov_attrs from .noise_est import NoiseEst @@ -329,3 +330,65 @@ def remove_noise(ds_Sv, ping_num, range_sample_num, noise_max=None, SNR_threshol def regrid(): return 1 + + +def scale_array(array_vals, new_min, new_max): + old_max = array_vals.max() + old_min = array_vals.min() + + # TODO: error out for old_max == old_min + + old_range = (old_max - old_min) + new_range = (new_max - new_min) + new_array = (((array_vals - old_min) * new_range) / old_range) + new_min + + return new_array + + +def create_known_mean_data(): + + num_pings_in_bin = [4, 5, 6] + num_er_in_bin = [10, 20, 11] + + er_bin_ranges = [(0, 5), (5.1, 10), (10.1, 20)] + + arrays = [] + means = [] + for count, er_bin in enumerate(num_er_in_bin): + + temp = [] + for ping_bin in num_pings_in_bin: + a = np.random.random_sample((er_bin, ping_bin)) + + # scale a + a = scale_array(a, er_bin_ranges[count][0], er_bin_ranges[count][1]) + + temp.append(a) + + means.append([arr.mean() for arr in temp]) + arrays.append(np.concatenate(temp, axis=1)) + + full_array = np.concatenate(arrays, axis=0) + + +def bin_and_mean(arr, bins_ping, bins_er, pings): + + if isinstance(pings, dask.array): + bin_ping_ind = np.digitize(pings.compute(), bins_ping, right=False) + else: + bin_ping_ind = np.digitize(pings, bins_ping, right=False) + + # possible bin value, note: 0 means less than first bin val, + # and 
len(bins) + 1 is greater than the last bin value + n_bin_ping = len(bins_ping) + 1 + n_bin_er = len(bins_er) + 1 + + binned_means = [] + for bin_ping in range(n_bin_ping): + + da_binned_ping = arr[:, bin_ping_ind == bin_ping] + + bin_er_ind = np.digitize(da_binned_ping, bins_er, right=False) + + for bin_er in range(n_bin_er): + binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) \ No newline at end of file From 12c0d69f42d4f328c59c47d51921eaa4c7023573 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 11 Oct 2022 10:22:16 -0700 Subject: [PATCH 02/28] finish structure of bin_and_mean_2d and create simple tests for it --- echopype/preprocess/api.py | 71 +++++++----------- echopype/tests/preprocess/test_preprocess.py | 76 ++++++++++++++++++++ 2 files changed, 103 insertions(+), 44 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index a0b05ec2c..9234e6ff5 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,10 +2,10 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" +import dask.array import numpy as np import pandas as pd import xarray as xr -import dask.array from ..utils.prov import echopype_prov_attrs from .noise_est import NoiseEst @@ -332,63 +332,46 @@ def regrid(): return 1 -def scale_array(array_vals, new_min, new_max): - old_max = array_vals.max() - old_min = array_vals.min() - - # TODO: error out for old_max == old_min - - old_range = (old_max - old_min) - new_range = (new_max - new_min) - new_array = (((array_vals - old_min) * new_range) / old_range) + new_min - - return new_array - - -def create_known_mean_data(): - - num_pings_in_bin = [4, 5, 6] - num_er_in_bin = [10, 20, 11] - - er_bin_ranges = [(0, 5), (5.1, 10), (10.1, 20)] - - arrays = [] - means = [] - for count, er_bin in enumerate(num_er_in_bin): - - temp = [] - for ping_bin in num_pings_in_bin: - a = np.random.random_sample((er_bin, ping_bin)) - - # scale a - a = scale_array(a, er_bin_ranges[count][0], er_bin_ranges[count][1]) +def bin_and_mean_2d(arr, bins_ping, bins_er, pings): + """ + Bins and means a 2D array - temp.append(a) - means.append([arr.mean() for arr in temp]) - arrays.append(np.concatenate(temp, axis=1)) + bins_ping, bins_er are bins in np.digitize format - full_array = np.concatenate(arrays, axis=0) + TODO: document! + Notes + ----- + This function assumes that values in ``arr`` are not outside + of the bins presecribed by ``bins_er``. Additionally, it also + assumes that values in ``pings`` are not outside the bins given + by bins_ping. 
+ """ -def bin_and_mean(arr, bins_ping, bins_er, pings): + # TODO: put in check to make sure arr is 2D - if isinstance(pings, dask.array): + if isinstance(pings, dask.array.Array): bin_ping_ind = np.digitize(pings.compute(), bins_ping, right=False) else: bin_ping_ind = np.digitize(pings, bins_ping, right=False) - # possible bin value, note: 0 means less than first bin val, - # and len(bins) + 1 is greater than the last bin value - n_bin_ping = len(bins_ping) + 1 - n_bin_er = len(bins_er) + 1 + n_bin_ping = len(bins_ping) + n_bin_er = len(bins_er) binned_means = [] - for bin_ping in range(n_bin_ping): + for bin_ping in range(1, n_bin_ping): da_binned_ping = arr[:, bin_ping_ind == bin_ping] bin_er_ind = np.digitize(da_binned_ping, bins_er, right=False) - for bin_er in range(n_bin_er): - binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) \ No newline at end of file + for bin_er in range(1, n_bin_er): + binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) + + if isinstance(arr, dask.array.Array): + means = dask.array.from_array(binned_means) + else: + means = np.array(binned_means) + + return means.reshape((n_bin_ping - 1, n_bin_er - 1)) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 58be7cff0..e2876a6d7 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -3,6 +3,9 @@ import xarray as xr import echopype as ep import pytest +import dask.array + +from echopype.preprocess.api import bin_and_mean_2d @pytest.fixture( @@ -507,3 +510,76 @@ def test_preprocess_mvbs(test_data_samples): range_kwargs.pop('azfp_cal_type') Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) assert ep.preprocess.compute_MVBS(Sv) is not None + + +def create_bins(csum_array): + # TODO: document! 
+ bins = [] + for count, csum in enumerate(csum_array): + + if count == 0: + bins.append((0, csum)) + else: + # add 0.01 so that left bins don't overlap + bins.append((csum_array[count-1] + 0.01, csum)) + return bins + + +def create_known_mean_data(create_dask, num_pings_in_bin, num_er_in_bin): + # TODO: document! + + ping_csum = np.cumsum(num_pings_in_bin) + ping_bins = create_bins(ping_csum) + + er_csum = np.cumsum(num_er_in_bin) + er_bins = create_bins(er_csum) + + arrays = [] + means = [] + for ping_bin in num_pings_in_bin: + + temp = [] + for count, bin_val in enumerate(er_bins): + + if create_dask: + a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + else: + a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + + temp.append(a) + + means.append([arr.mean() for arr in temp]) + arrays.append(np.concatenate(temp, axis=0)) + + if create_dask: + means = dask.array.from_array(means).compute() + + full_array = np.concatenate(arrays, axis=1) + + return means, full_array, ping_bins, er_bins + + +def test_bin_and_mean_2d(): + # TODO: document and create a fixture with input of create_dask + + create_dask = True + + num_pings_in_bin = np.random.randint(low=10, high=1000, size=10) + num_er_in_bin = np.random.randint(low=10, high=1000, size=10) + + means, full_array, ping_bins, er_bins = create_known_mean_data(create_dask, + num_pings_in_bin, num_er_in_bin) + + ping_times = np.concatenate([np.random.uniform(bin_val[0], bin_val[1], num_pings_in_bin[count]) + for count, bin_val in enumerate(ping_bins)]) + + digitize_ping_bin = [*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:]] + digitize_er_bin = [*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]] + + calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times) + + if create_dask: + calc_means = calc_means.compute() + + assert np.allclose(calc_means, means, atol=1e-10, rtol=1e-10) + From 
0141f8ec1cd752f4423aa9ea3aed4ac8bb206157 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 17 Oct 2022 17:06:09 -0700 Subject: [PATCH 03/28] modify bin_and_mean_2d so that it produces the appropriate values and add get_MVBS_along_channels, which computes MVBS for each channel --- echopype/preprocess/api.py | 85 +++++++++++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 11 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 9234e6ff5..ac062a8cb 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -332,7 +332,7 @@ def regrid(): return 1 -def bin_and_mean_2d(arr, bins_ping, bins_er, pings): +def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): """ Bins and means a 2D array @@ -351,27 +351,90 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings): # TODO: put in check to make sure arr is 2D + # turn datetime into integers, so we can use np.digitize if isinstance(pings, dask.array.Array): - bin_ping_ind = np.digitize(pings.compute(), bins_ping, right=False) + pings_i8 = pings.compute().data.view("i8") else: - bin_ping_ind = np.digitize(pings, bins_ping, right=False) + pings_i8 = pings.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_ping_i8 = bins_ping.view("i8") + + # get bin index for each ping + bin_ping_ind = np.digitize(pings_i8, bins_ping_i8, right=False) + + # get bin index for each echo_range value + if isinstance(echo_range, dask.array.Array): + bin_er_ind = np.digitize(echo_range.compute(), bins_er, right=False) + else: + bin_er_ind = np.digitize(echo_range, bins_er, right=False) n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) binned_means = [] - for bin_ping in range(1, n_bin_ping): - - da_binned_ping = arr[:, bin_ping_ind == bin_ping] + for bin_ping in range(1, n_bin_ping + 1): - bin_er_ind = np.digitize(da_binned_ping, bins_er, right=False) + # ping_selected_data = arr[bin_ping_ind == bin_ping, :] + ping_selected_data = 
np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) + data_rows = [] + # TODO: can we do a mapping here, instead of a for loop? for bin_er in range(1, n_bin_er): - binned_means.append(np.nanmean(da_binned_ping[bin_er_ind == bin_er])) + # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) + data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) + + # TODO:: can we use something besides stack that is more efficient? + binned_means.append(np.stack(data_rows, axis=0)) if isinstance(arr, dask.array.Array): - means = dask.array.from_array(binned_means) + return 10 * np.log10(np.stack(binned_means, axis=0).rechunk("auto")) else: - means = np.array(binned_means) + return 10 * np.log10(np.stack(binned_means, axis=0)) + + +def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): + + # range_meter_bin = 5 + # range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) + # ping_interval = np.array(list(ds_Sv.ping_time. + # resample(ping_time='20s', skipna=True).groups.keys())) + + all_MVBS = [] + for chan in ds_Sv.channel: + + # squeeze to remove "channel" dim if present + # TODO: not sure why not already removed for the AZFP case. Investigate. 
+ ds = ds_Sv.sel(channel=chan).squeeze() + + # average should be done in linear domain + sv = 10 ** (ds["Sv"] / 10) + + # set 1D coordinate using the 1st ping echo_range since identical for all pings + # remove padded NaN entries if exist for all pings + er = ( + ds["echo_range"] + .dropna(dim="range_sample", how="all") + .dropna(dim="ping_time") + .isel(ping_time=0) + ) + + # use first ping er as indexer for sv + sv = sv.sel(range_sample=er.range_sample.values) + sv.coords["echo_range"] = (["range_sample"], er.values) + sv = sv.swap_dims({"range_sample": "echo_range"}) + + # collect the MVBS calculated for the channel + all_MVBS.append( + bin_and_mean_2d( + sv.data, + bins_ping=ping_interval, + bins_er=range_interval, + pings=sv.ping_time.data, + echo_range=sv.echo_range.data, + ) + ) + + MVBS_values = np.stack(all_MVBS, axis=0) - return means.reshape((n_bin_ping - 1, n_bin_er - 1)) + return MVBS_values From 7e230e3271d4bb706bed00e7cc1dfe52a164aca2 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 18 Oct 2022 12:08:27 -0700 Subject: [PATCH 04/28] investigate more efficient dask graph --- echopype/preprocess/api.py | 67 +++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index ac062a8cb..838baa30a 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -373,24 +373,48 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): n_bin_er = len(bins_er) binned_means = [] - for bin_ping in range(1, n_bin_ping + 1): - - # ping_selected_data = arr[bin_ping_ind == bin_ping, :] - ping_selected_data = np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) - - data_rows = [] - # TODO: can we do a mapping here, instead of a for loop? 
- for bin_er in range(1, n_bin_er): - # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) - data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) - - # TODO:: can we use something besides stack that is more efficient? - binned_means.append(np.stack(data_rows, axis=0)) - - if isinstance(arr, dask.array.Array): - return 10 * np.log10(np.stack(binned_means, axis=0).rechunk("auto")) - else: - return 10 * np.log10(np.stack(binned_means, axis=0)) + for bin_er in range(1, n_bin_er): + # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + + # er_selected_data = arr[:, bin_er_ind == bin_er] + er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # + # binned_means.append(er_selected_data) + + data_rows = [] # TODO: rename? + for bin_ping in range(1, n_bin_ping + 1): + # data_rows.append(np.nanmean(er_selected_data[bin_ping_ind == bin_ping])) + # data_rows.append(er_selected_data[bin_ping_ind == bin_ping, :]) + data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) + # + # binned_means.append(data_rows) + temp = np.concatenate(data_rows, axis=0) + temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk('auto') #[None, None], chunks=(1, 1)) + binned_means.append(temp) + + stacked_means = np.transpose(np.vstack(binned_means)).rechunk('auto') + # return binned_means + return 10 * np.log10(stacked_means) + + # binned_means = [] + # for bin_ping in range(1, n_bin_ping + 1): + # + # # ping_selected_data = arr[bin_ping_ind == bin_ping, :] + # ping_selected_data = np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) + # + # data_rows = [] + # # TODO: can we do a mapping here, instead of a for loop? + # for bin_er in range(1, n_bin_er): + # # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) + # data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) + # + # # TODO:: can we use something besides stack that is more efficient? 
+ # binned_means.append(np.stack(data_rows, axis=0)) + + # if isinstance(arr, dask.array.Array): + # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) #.rechunk("auto"))) + # else: + # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): @@ -401,7 +425,7 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # resample(ping_time='20s', skipna=True).groups.keys())) all_MVBS = [] - for chan in ds_Sv.channel: + for chan in ds_Sv.channel[:1]: # squeeze to remove "channel" dim if present # TODO: not sure why not already removed for the AZFP case. Investigate. @@ -435,6 +459,5 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): ) ) - MVBS_values = np.stack(all_MVBS, axis=0) - - return MVBS_values + # return all_MVBS + return np.stack(all_MVBS, axis=0) From 43eb06294c11b3ff499330ab78301b46c1337120 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 19 Oct 2022 10:09:23 -0700 Subject: [PATCH 05/28] investigate chunking of MVBS computation --- echopype/preprocess/api.py | 50 +++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 838baa30a..3cebcbe04 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -372,28 +372,51 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # er_selected_data = arr[:, bin_er_ind == bin_er] + # + # temp = [] + # for bin_ping in range(1, n_bin_ping + 1): + # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + # temp.append(er_selected_data[indices, :]) + # + # binned_means.append(temp) + # + # return binned_means + + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # 
er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # # er_selected_data = arr[:, bin_er_ind == bin_er] + # + # binned_means.append(er_selected_data) + # + # temp = np.vstack(binned_means) + # + # temp2 = [] + # for bin_ping in range(1, n_bin_ping + 1): + # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + # temp2.append(10 * np.log10(np.nanmean(temp[:, indices], axis=1))) + + # return np.vstack(temp2) + + # return binned_means + binned_means = [] for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - - # er_selected_data = arr[:, bin_er_ind == bin_er] er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # - # binned_means.append(er_selected_data) data_rows = [] # TODO: rename? for bin_ping in range(1, n_bin_ping + 1): - # data_rows.append(np.nanmean(er_selected_data[bin_ping_ind == bin_ping])) - # data_rows.append(er_selected_data[bin_ping_ind == bin_ping, :]) data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) - # - # binned_means.append(data_rows) + temp = np.concatenate(data_rows, axis=0) - temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk('auto') #[None, None], chunks=(1, 1)) + temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") binned_means.append(temp) - stacked_means = np.transpose(np.vstack(binned_means)).rechunk('auto') - # return binned_means + stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") return 10 * np.log10(stacked_means) # binned_means = [] @@ -425,7 +448,7 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # resample(ping_time='20s', skipna=True).groups.keys())) all_MVBS = [] - for chan in ds_Sv.channel[:1]: + for chan in ds_Sv.channel: # squeeze to remove "channel" dim if present # TODO: not sure why not already removed for the AZFP case. Investigate. 
@@ -459,5 +482,4 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): ) ) - # return all_MVBS return np.stack(all_MVBS, axis=0) From fd91e3abb41a90c18542ef01cc8b6431e5f630f8 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 19 Oct 2022 13:26:25 -0700 Subject: [PATCH 06/28] add new approach that computes mean binned echorange values and then bins and means ping_time --- echopype/preprocess/api.py | 60 ++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 3cebcbe04..67a2451f7 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -372,19 +372,35 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) + binned_means = [] + for bin_er in range(1, n_bin_er): + er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + + binned_means.append(er_selected_data) + + er_means = np.vstack(binned_means).compute() + + final = np.empty((n_bin_ping, n_bin_er - 1)) + for bin_ping in range(1, n_bin_ping + 1): + indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + final[bin_ping - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) + + return final + # # binned_means = [] # for bin_er in range(1, n_bin_er): - # # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # er_selected_data = arr[:, bin_er_ind == bin_er] + # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) # - # temp = [] - # for bin_ping in range(1, n_bin_ping + 1): - # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - # temp.append(er_selected_data[indices, :]) + # binned_means.append(er_selected_data) # - # binned_means.append(temp) + # er_means = np.vstack(binned_means) # - # return binned_means + # final_list = [] + # for bin_ping in range(1, n_bin_ping + 1): + # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() + # 
final_list.append(10 * np.log10(np.nanmean(er_means[:, indices], axis=1))) + # + # return np.vstack(final_list) # binned_means = [] # for bin_er in range(1, n_bin_er): @@ -404,20 +420,20 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): # return binned_means - binned_means = [] - for bin_er in range(1, n_bin_er): - er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - - data_rows = [] # TODO: rename? - for bin_ping in range(1, n_bin_ping + 1): - data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) - - temp = np.concatenate(data_rows, axis=0) - temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") - binned_means.append(temp) - - stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") - return 10 * np.log10(stacked_means) + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) + # + # data_rows = [] # TODO: rename? + # for bin_ping in range(1, n_bin_ping + 1): + # data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) + # + # temp = np.concatenate(data_rows, axis=0) + # temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") + # binned_means.append(temp) + # + # stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") + # return 10 * np.log10(stacked_means) # binned_means = [] # for bin_ping in range(1, n_bin_ping + 1): From 9b85afbd1abaa46d3810c50775576c954ec4caca Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 20 Oct 2022 09:55:32 -0700 Subject: [PATCH 07/28] remove commented out lines and add a function for bin and meaning the times --- echopype/preprocess/api.py | 134 +++++++++++++------------------------ 1 file changed, 46 insertions(+), 88 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 67a2451f7..b594ae5bb 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -332,7 +332,7 @@ def regrid(): return 1 -def 
bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): +def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): """ Bins and means a 2D array @@ -351,109 +351,67 @@ def bin_and_mean_2d(arr, bins_ping, bins_er, pings, echo_range): # TODO: put in check to make sure arr is 2D - # turn datetime into integers, so we can use np.digitize - if isinstance(pings, dask.array.Array): - pings_i8 = pings.compute().data.view("i8") - else: - pings_i8 = pings.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_ping_i8 = bins_ping.view("i8") - - # get bin index for each ping - bin_ping_ind = np.digitize(pings_i8, bins_ping_i8, right=False) - # get bin index for each echo_range value if isinstance(echo_range, dask.array.Array): bin_er_ind = np.digitize(echo_range.compute(), bins_er, right=False) else: bin_er_ind = np.digitize(echo_range, bins_er, right=False) - n_bin_ping = len(bins_ping) n_bin_er = len(bins_er) binned_means = [] for bin_er in range(1, n_bin_er): + er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) binned_means.append(er_selected_data) er_means = np.vstack(binned_means).compute() - final = np.empty((n_bin_ping, n_bin_er - 1)) - for bin_ping in range(1, n_bin_ping + 1): - indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - final[bin_ping - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) + # turn datetime into integers, so we can use np.digitize + if isinstance(times, dask.array.Array): + times_i8 = times.compute().data.view("i8") + else: + times_i8 = times.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_time_i8 = bins_time.view("i8") + + final = bin_and_mean_times(er_means, bins_time_i8, times_i8) + + return final + + +def bin_and_mean_times(er_means, bins_time_i8, times_i8): + """ + + + Parameters + ---------- + er_means + bins_time + times + + Returns + ------- + + Notes + ----- + This function assumes that ``times`` corresponds to the + columns of 
``er_means``. + """ + + # get bin index for each time + bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) + + n_bin_time = len(bins_time_i8) + + final = np.empty((n_bin_time, er_means.shape[0])) + for bin_time in range(1, n_bin_time + 1): + indices = np.argwhere(bin_time_ind == bin_time).flatten() + final[bin_time - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) return final - # - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # - # binned_means.append(er_selected_data) - # - # er_means = np.vstack(binned_means) - # - # final_list = [] - # for bin_ping in range(1, n_bin_ping + 1): - # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - # final_list.append(10 * np.log10(np.nanmean(er_means[:, indices], axis=1))) - # - # return np.vstack(final_list) - - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # # er_selected_data = arr[:, bin_er_ind == bin_er] - # - # binned_means.append(er_selected_data) - # - # temp = np.vstack(binned_means) - # - # temp2 = [] - # for bin_ping in range(1, n_bin_ping + 1): - # indices = np.argwhere(bin_ping_ind == bin_ping).flatten() - # temp2.append(10 * np.log10(np.nanmean(temp[:, indices], axis=1))) - - # return np.vstack(temp2) - - # return binned_means - - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - # - # data_rows = [] # TODO: rename? 
- # for bin_ping in range(1, n_bin_ping + 1): - # data_rows.append(er_selected_data[bin_ping_ind == bin_ping]) - # - # temp = np.concatenate(data_rows, axis=0) - # temp = temp.map_blocks(lambda x: np.nanmean(x), chunks=(1,)).rechunk("auto") - # binned_means.append(temp) - # - # stacked_means = np.transpose(np.vstack(binned_means)).rechunk("auto") - # return 10 * np.log10(stacked_means) - - # binned_means = [] - # for bin_ping in range(1, n_bin_ping + 1): - # - # # ping_selected_data = arr[bin_ping_ind == bin_ping, :] - # ping_selected_data = np.nanmean(arr[bin_ping_ind == bin_ping, :], axis=0) - # - # data_rows = [] - # # TODO: can we do a mapping here, instead of a for loop? - # for bin_er in range(1, n_bin_er): - # # data_rows.append(np.nanmean(ping_selected_data[:, bin_er_ind == bin_er])) - # data_rows.append(np.nanmean(ping_selected_data[bin_er_ind == bin_er])) - # - # # TODO:: can we use something besides stack that is more efficient? - # binned_means.append(np.stack(data_rows, axis=0)) - - # if isinstance(arr, dask.array.Array): - # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) #.rechunk("auto"))) - # else: - # return 10 * np.log10(np.transpose(np.stack(binned_means, axis=0))) def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): @@ -491,9 +449,9 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): all_MVBS.append( bin_and_mean_2d( sv.data, - bins_ping=ping_interval, + bins_time=ping_interval, bins_er=range_interval, - pings=sv.ping_time.data, + times=sv.ping_time.data, echo_range=sv.echo_range.data, ) ) From 44adc23c15c5b944f71aa85ba822206ace3579a7 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 3 Nov 2022 17:06:12 -0700 Subject: [PATCH 08/28] start creating a bin and mean 2d method that accounts for different echo_range values for each ping time --- echopype/preprocess/api.py | 150 +++++++++++++++++-- echopype/tests/preprocess/test_preprocess.py | 13 +- 2 files changed, 144 insertions(+), 19 deletions(-) 
diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index b594ae5bb..e84e8ae69 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,6 +2,8 @@ Functions for enhancing the spatial and temporal coherence of data. """ +from typing import Tuple + import dask.array import numpy as np import pandas as pd @@ -382,6 +384,115 @@ def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): return final +def get_bin_indices( + echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: + """ + Obtains the bin index of ``echo_range`` and ``times`` based + on the binning ``bins_er`` and ``bins_time``, respectively. + + Parameters + ---------- + echo_range: np.ndarray + 2D array of echo range values + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + + Returns + ------- + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + bin_time_ind: np.ndarray + 1D array of bin indices for ``times`` + """ + + # get bin index for each echo range value + digitized_echo_range = np.digitize(echo_range, bins_er, right=False) + + # turn datetime into integers, so we can use np.digitize + if isinstance(times, dask.array.Array): + times_i8 = times.compute().data.view("i8") + else: + times_i8 = times.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_time_i8 = bins_time.view("i8") + + # get bin index for each time + bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) + + return digitized_echo_range, bin_time_ind + + +def mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): + + means = [] + for bin_er in range(1, n_bin_er): + means.append(10 * np.log10(np.nanmean(temp_arr[dig_er_subset 
== bin_er], axis=0))) + + return means + + +def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): + """ + + + Returns + ------- + + + Notes + ----- + This function assumes that ``arr`` has rows corresponding to + ping time and columns corresponding to echo range. + """ + + # determine if array to bin is lazy or not + is_lazy = False + if isinstance(arr, dask.array.Array): + is_lazy = True + + print(f"is_lazy = {is_lazy}") + + # get the number of echo range and time bins + n_bin_er = len(bins_er) + n_bin_time = len(bins_time) + + # obtain the bin indices for echo_range and times + digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) + + all_means = [] + for bin_time in range(1, n_bin_time + 1): + + # get the indices of time in bin index bin_time + indices_time = np.argwhere(bin_time_ind == bin_time).flatten() + + # select only those array values that are in the time bin being considered + temp_arr = arr[indices_time, :] + + if is_lazy: + all_means.append( + dask.delayed(mean_temp_arr)( + n_bin_er, temp_arr, digitized_echo_range[indices_time, :] + ) + ) + else: + all_means.append( + mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) + ) + + if is_lazy: + all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? + + final_reduced = np.array(all_means) + + return final_reduced + + def bin_and_mean_times(er_means, bins_time_i8, times_i8): """ @@ -421,6 +532,8 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # ping_interval = np.array(list(ds_Sv.ping_time. 
# resample(ping_time='20s', skipna=True).groups.keys())) + print("account for uneven echo range ") + all_MVBS = [] for chan in ds_Sv.channel: @@ -431,28 +544,37 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) - # set 1D coordinate using the 1st ping echo_range since identical for all pings - # remove padded NaN entries if exist for all pings - er = ( - ds["echo_range"] - .dropna(dim="range_sample", how="all") - .dropna(dim="ping_time") - .isel(ping_time=0) - ) + # # set 1D coordinate using the 1st ping echo_range since identical for all pings + # # remove padded NaN entries if exist for all pings + # er = ( + # ds["echo_range"] + # .dropna(dim="range_sample", how="all") + # .dropna(dim="ping_time") + # .isel(ping_time=0) + # ) - # use first ping er as indexer for sv - sv = sv.sel(range_sample=er.range_sample.values) - sv.coords["echo_range"] = (["range_sample"], er.values) - sv = sv.swap_dims({"range_sample": "echo_range"}) + # # use first ping er as indexer for sv + # sv = sv.sel(range_sample=er.range_sample.values) + # sv.coords["echo_range"] = (["range_sample"], er.values) + # sv = sv.swap_dims({"range_sample": "echo_range"}) # collect the MVBS calculated for the channel + # all_MVBS.append( + # bin_and_mean_2d( + # sv.data, + # bins_time=ping_interval, + # bins_er=range_interval, + # times=sv.ping_time.data, + # echo_range=sv.echo_range.data, + # ) + # ) all_MVBS.append( - bin_and_mean_2d( + bin_and_mean_2d_v2( sv.data, bins_time=ping_interval, bins_er=range_interval, times=sv.ping_time.data, - echo_range=sv.echo_range.data, + echo_range=ds["echo_range"].data, ) ) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index e2876a6d7..4e4c64d20 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -542,21 +542,24 @@ def create_known_mean_data(create_dask, 
num_pings_in_bin, num_er_in_bin): for count, bin_val in enumerate(er_bins): if create_dask: - a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) else: - a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[count], ping_bin)) + a = np.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) temp.append(a) means.append([arr.mean() for arr in temp]) arrays.append(np.concatenate(temp, axis=0)) + echo_range = np.linspace(0, er_csum.max(), num=er_csum.max() + 1, dtype=np.int64) + if create_dask: means = dask.array.from_array(means).compute() + echo_range = dask.array.from_array(echo_range) full_array = np.concatenate(arrays, axis=1) - return means, full_array, ping_bins, er_bins + return means, full_array, ping_bins, er_bins, echo_range def test_bin_and_mean_2d(): @@ -567,7 +570,7 @@ def test_bin_and_mean_2d(): num_pings_in_bin = np.random.randint(low=10, high=1000, size=10) num_er_in_bin = np.random.randint(low=10, high=1000, size=10) - means, full_array, ping_bins, er_bins = create_known_mean_data(create_dask, + means, full_array, ping_bins, er_bins, echo_range = create_known_mean_data(create_dask, num_pings_in_bin, num_er_in_bin) ping_times = np.concatenate([np.random.uniform(bin_val[0], bin_val[1], num_pings_in_bin[count]) @@ -576,7 +579,7 @@ def test_bin_and_mean_2d(): digitize_ping_bin = [*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:]] digitize_er_bin = [*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]] - calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times) + calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times, echo_range) if create_dask: calc_means = calc_means.compute() From a79dae459660e0895504aefb83bce4f0ef0376cc Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 11:07:06 -0700 Subject: [PATCH 09/28] replace previous bin and 
mean method with general method, construct mock sv values, and test general method against mock values --- echopype/preprocess/api.py | 131 +++---------------- echopype/tests/preprocess/test_preprocess.py | 115 +++++++++++----- 2 files changed, 99 insertions(+), 147 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index e84e8ae69..fb4b17072 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -334,56 +334,6 @@ def regrid(): return 1 -def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): - """ - Bins and means a 2D array - - - bins_ping, bins_er are bins in np.digitize format - - TODO: document! - - Notes - ----- - This function assumes that values in ``arr`` are not outside - of the bins presecribed by ``bins_er``. Additionally, it also - assumes that values in ``pings`` are not outside the bins given - by bins_ping. - """ - - # TODO: put in check to make sure arr is 2D - - # get bin index for each echo_range value - if isinstance(echo_range, dask.array.Array): - bin_er_ind = np.digitize(echo_range.compute(), bins_er, right=False) - else: - bin_er_ind = np.digitize(echo_range, bins_er, right=False) - - n_bin_er = len(bins_er) - - binned_means = [] - for bin_er in range(1, n_bin_er): - - er_selected_data = np.nanmean(arr[:, bin_er_ind == bin_er], axis=1) - - binned_means.append(er_selected_data) - - er_means = np.vstack(binned_means).compute() - - # turn datetime into integers, so we can use np.digitize - if isinstance(times, dask.array.Array): - times_i8 = times.compute().data.view("i8") - else: - times_i8 = times.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_time_i8 = bins_time.view("i8") - - final = bin_and_mean_times(er_means, bins_time_i8, times_i8) - - return final - - def get_bin_indices( echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray ) -> Tuple[np.ndarray, np.ndarray]: @@ -429,15 +379,16 @@ def get_bin_indices( def 
mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): + # TODO: document means = [] for bin_er in range(1, n_bin_er): - means.append(10 * np.log10(np.nanmean(temp_arr[dig_er_subset == bin_er], axis=0))) + means.append(np.nanmean(temp_arr[dig_er_subset == bin_er], axis=0)) return means -def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): +def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): """ @@ -451,6 +402,8 @@ def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): ping time and columns corresponding to echo range. """ + # TODO: document + # determine if array to bin is lazy or not is_lazy = False if isinstance(arr, dask.array.Array): @@ -493,38 +446,6 @@ def bin_and_mean_2d_v2(arr, bins_time, bins_er, times, echo_range): return final_reduced -def bin_and_mean_times(er_means, bins_time_i8, times_i8): - """ - - - Parameters - ---------- - er_means - bins_time - times - - Returns - ------- - - Notes - ----- - This function assumes that ``times`` corresponds to the - columns of ``er_means``. 
- """ - - # get bin index for each time - bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) - - n_bin_time = len(bins_time_i8) - - final = np.empty((n_bin_time, er_means.shape[0])) - for bin_time in range(1, n_bin_time + 1): - indices = np.argwhere(bin_time_ind == bin_time).flatten() - final[bin_time - 1, :] = 10 * np.log10(np.nanmean(er_means[:, indices], axis=1)) - - return final - - def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # range_meter_bin = 5 @@ -544,38 +465,16 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) - # # set 1D coordinate using the 1st ping echo_range since identical for all pings - # # remove padded NaN entries if exist for all pings - # er = ( - # ds["echo_range"] - # .dropna(dim="range_sample", how="all") - # .dropna(dim="ping_time") - # .isel(ping_time=0) - # ) - - # # use first ping er as indexer for sv - # sv = sv.sel(range_sample=er.range_sample.values) - # sv.coords["echo_range"] = (["range_sample"], er.values) - # sv = sv.swap_dims({"range_sample": "echo_range"}) - - # collect the MVBS calculated for the channel - # all_MVBS.append( - # bin_and_mean_2d( - # sv.data, - # bins_time=ping_interval, - # bins_er=range_interval, - # times=sv.ping_time.data, - # echo_range=sv.echo_range.data, - # ) - # ) - all_MVBS.append( - bin_and_mean_2d_v2( - sv.data, - bins_time=ping_interval, - bins_er=range_interval, - times=sv.ping_time.data, - echo_range=ds["echo_range"].data, - ) + # get MVBS for channel in linear domain + chan_MVBS = bin_and_mean_2d( + sv.data, + bins_time=ping_interval, + bins_er=range_interval, + times=sv.ping_time.data, + echo_range=ds["echo_range"].data, ) + # apply inverse mapping to get back to the original domain and store values + all_MVBS.append(10 * np.log10(chan_MVBS)) + return np.stack(all_MVBS, axis=0) diff --git a/echopype/tests/preprocess/test_preprocess.py 
b/echopype/tests/preprocess/test_preprocess.py index 4e4c64d20..1b9f5dd85 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -525,64 +525,117 @@ def create_bins(csum_array): return bins -def create_known_mean_data(create_dask, num_pings_in_bin, num_er_in_bin): - # TODO: document! +def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range): + + # TODO: document and split up function into parts + + num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) + # create bins for ping_time dimension ping_csum = np.cumsum(num_pings_in_bin) ping_bins = create_bins(ping_csum) + # create bins for echo_range dimension + num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) er_csum = np.cumsum(num_er_in_bin) er_bins = create_bins(er_csum) - arrays = [] - means = [] - for ping_bin in num_pings_in_bin: + # build echo_range array + final_arrays = [] + all_er_bin_nums = [] + ping_times_in_bin = [] + for ping_ind, ping_bin in enumerate(ping_bins): + + ping_times_in_bin.append(np.random.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) + + # randomly determine the number of values in each echo_range bin + num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) - temp = [] + # store the number of values in each echo_range bin + all_er_bin_nums.append(num_er_in_bin) + + er_row_block = [] for count, bin_val in enumerate(er_bins): + # create a block of echo_range values + a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) + + # store the block of echo_range values + er_row_block.append(a) + + # collect and construct echo_range row block + final_arrays.append(np.concatenate(er_row_block, axis=1)) - if create_dask: - a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) - else: - a = 
np.random.uniform(bin_val[0], bin_val[1], (num_er_in_bin[count],)) + # get final ping_time dimension + final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') - temp.append(a) + # get maximum number of echo_range elements amongst all times + max_num_er_elem = max([arr.shape[1] for arr in final_arrays]) - means.append([arr.mean() for arr in temp]) - arrays.append(np.concatenate(temp, axis=0)) + # total number of ping times + tot_num_times = ping_csum[-1] - echo_range = np.linspace(0, er_csum.max(), num=er_csum.max() + 1, dtype=np.int64) + # pad echo_range dimension with nans and create final echo_range + final_er = np.empty((tot_num_times, max_num_er_elem)) + final_er[:] = np.nan + for count, arr in enumerate(final_arrays): - if create_dask: - means = dask.array.from_array(means).compute() - echo_range = dask.array.from_array(echo_range) + if count == 0: + final_er[0:ping_csum[count], 0:arr.shape[1]] = arr + else: + final_er[ping_csum[count - 1]:ping_csum[count], 0:arr.shape[1]] = arr - full_array = np.concatenate(arrays, axis=1) + # pad echo_range dimension with nans and create final sv + final_sv = np.empty((tot_num_times, max_num_er_elem)) + final_sv[:] = np.nan - return means, full_array, ping_bins, er_bins, echo_range + final_means = [] + for count, arr in enumerate(all_er_bin_nums): + # create sv row values using natural numbers + sv_row_list = [np.arange(1, num_elem + 1, 1) for num_elem in arr] + + # create final sv row + sv_row = np.concatenate(sv_row_list) + + # get final mean which is n+1/2 (since we are using natural numbers) + final_means.append([(len(elem) + 1) / 2.0 for elem in sv_row_list]) + + # create sv row block + sv_row_block = np.tile(sv_row, (num_pings_in_bin[count], 1)) + + if count == 0: + final_sv[0:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block + else: + final_sv[ping_csum[count - 1]:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block + + # create final sv MVBS + final_MVBS = np.vstack(final_means) + + 
return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time def test_bin_and_mean_2d(): + # TODO: create lazy option for sv and echo_range + # TODO: document and create a fixture with input of create_dask create_dask = True - num_pings_in_bin = np.random.randint(low=10, high=1000, size=10) - num_er_in_bin = np.random.randint(low=10, high=1000, size=10) + final_num_ping_bins = 2 + final_num_er_bins = 5 - means, full_array, ping_bins, er_bins, echo_range = create_known_mean_data(create_dask, - num_pings_in_bin, num_er_in_bin) + ping_range = [1, 5] + er_range = [1, 5] - ping_times = np.concatenate([np.random.uniform(bin_val[0], bin_val[1], num_pings_in_bin[count]) - for count, bin_val in enumerate(ping_bins)]) + known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range) - digitize_ping_bin = [*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:]] - digitize_er_bin = [*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]] + digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) + digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) + digitize_ping_bin = digitize_ping_bin.astype('datetime64[ns]') - calc_means = bin_and_mean_2d(full_array, digitize_ping_bin, digitize_er_bin, ping_times, echo_range) + calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, + bins_er=digitize_er_bin, times=final_ping_time, echo_range=final_er) - if create_dask: - calc_means = calc_means.compute() + assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) - assert np.allclose(calc_means, means, atol=1e-10, rtol=1e-10) From 2a7629aa6f3706b7968f43c75e262416af671c4e Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 14:57:24 -0700 Subject: [PATCH 10/28] document and clean up code associated with creating the mock data set for MVBS testing --- 
echopype/tests/preprocess/test_preprocess.py | 226 +++++++++++++++++-- 1 file changed, 202 insertions(+), 24 deletions(-) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 1b9f5dd85..9a7109a62 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,6 +6,7 @@ import dask.array from echopype.preprocess.api import bin_and_mean_2d +from typing import List, Tuple, Iterable @pytest.fixture( @@ -512,40 +513,80 @@ def test_preprocess_mvbs(test_data_samples): assert ep.preprocess.compute_MVBS(Sv) is not None -def create_bins(csum_array): - # TODO: document! +def create_bins(csum_array: np.ndarray) -> Iterable: + """ + Constructs bin ranges based off of a cumulative + sum array. + + Parameters + ---------- + csum_array: np.ndarray + 1D array representing a cumulative sum + + Returns + ------- + bins: list + A list whose elements are the lower and upper bin ranges + """ + bins = [] + + # construct bins for count, csum in enumerate(csum_array): if count == 0: - bins.append((0, csum)) + + bins.append([0, csum]) + else: - # add 0.01 so that left bins don't overlap - bins.append((csum_array[count-1] + 0.01, csum)) - return bins + # add 0.01 so that left bins don't overlap + bins.append([csum_array[count-1] + 0.01, csum]) -def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range): + return bins - # TODO: document and split up function into parts - num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) +def create_echo_range_related_data(ping_bins: Iterable, + num_pings_in_bin: np.ndarray, + er_range: list, er_bins: Iterable, + final_num_er_bins: int) -> Tuple[list, list, list]: + """ + Creates ``echo_range`` values and associated bin information. 
- # create bins for ping_time dimension - ping_csum = np.cumsum(num_pings_in_bin) - ping_bins = create_bins(ping_csum) + Parameters + ---------- + ping_bins: list + A list whose elements are the lower and upper ping time bin ranges + num_pings_in_bin: np.ndarray + Specifies the number of pings in each ping time bin + er_range: list + A list whose first element is the lowest and second element is + the highest possible number of echo range values in a given bin + er_bins: list + A list whose elements are the lower and upper echo range bin ranges + final_num_er_bins: int + The total number of echo range bins - # create bins for echo_range dimension - num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) - er_csum = np.cumsum(num_er_in_bin) - er_bins = create_bins(er_csum) + Returns + ------- + all_er_bin_nums: list of np.ndarray + A list whose elements are the number of values in each echo_range + bin, for each ping bin + ping_times_in_bin: list of np.ndarray + A list whose elements are the ping_time values for each corresponding bin + final_arrays: list of np.ndarray + A list whose elements are the echo_range values for a given ping and + echo range bin block + """ - # build echo_range array final_arrays = [] all_er_bin_nums = [] ping_times_in_bin = [] + + # build echo_range array for ping_ind, ping_bin in enumerate(ping_bins): + # create the ping times associated with each ping bin ping_times_in_bin.append(np.random.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) # randomly determine the number of values in each echo_range bin @@ -566,8 +607,30 @@ def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, e # collect and construct echo_range row block final_arrays.append(np.concatenate(er_row_block, axis=1)) - # get final ping_time dimension - final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') + return all_er_bin_nums, ping_times_in_bin, final_arrays + + +def 
construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], + ping_csum: np.ndarray) -> Tuple[np.ndarray, int]: + """ + Creates the final 2D ``echo_range`` array with appropriate padding. + + Parameters + ---------- + final_arrays: list of np.ndarray + A list whose elements are the echo_range values for a given ping and + echo range bin block + ping_csum: np.ndarray + 1D array representing the cumulative sum for the number of ping times + in each ping bin + + Returns + ------- + final_er: np.ndarray + The final 2D ``echo_range`` array + max_num_er_elem: int + The maximum number of ``echo_range`` elements amongst all times + """ # get maximum number of echo_range elements amongst all times max_num_er_elem = max([arr.shape[1] for arr in final_arrays]) @@ -585,6 +648,39 @@ def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, e else: final_er[ping_csum[count - 1]:ping_csum[count], 0:arr.shape[1]] = arr + return final_er, max_num_er_elem + + +def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, + all_er_bin_nums: Iterable[np.ndarray], + num_pings_in_bin: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """ + Creates the final 2D Sv array with appropriate padding. 
+ + Parameters + ---------- + max_num_er_elem: int + The maximum number of ``echo_range`` elements amongst all times + ping_csum: np.ndarray + 1D array representing the cumulative sum for the number of ping times + in each ping bin + all_er_bin_nums: list of np.ndarray + A list whose elements are the number of values in each echo_range + bin, for each ping bin + num_pings_in_bin: np.ndarray + Specifies the number of pings in each ping time bin + + Returns + ------- + final_sv: np.ndarray + The final 2D Sv array + final_MVBS: np.ndarray + The final 2D known MVBS array + """ + + # total number of ping times + tot_num_times = ping_csum[-1] + # pad echo_range dimension with nans and create final sv final_sv = np.empty((tot_num_times, max_num_er_elem)) final_sv[:] = np.nan @@ -612,9 +708,85 @@ def create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, e # create final sv MVBS final_MVBS = np.vstack(final_means) + return final_sv, final_MVBS + + +def create_known_mean_data(final_num_ping_bins: int, + final_num_er_bins: int, + ping_range: list, + er_range: list) -> Tuple[np.ndarray, np.ndarray, Iterable, + Iterable, np.ndarray, np.ndarray]: + """ + Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays + where the MVBS is known. 
+ + Parameters + ---------- + final_num_ping_bins: int + The total number of ping time bins + final_num_er_bins: int + The total number of echo range bins + ping_range: list + A list whose first element is the lowest and second element is + the highest possible number of ping time values in a given bin + er_range: list + A list whose first element is the lowest and second element is + the highest possible number of echo range values in a given bin + + Returns + ------- + final_MVBS: np.ndarray + The final 2D known MVBS array + final_sv: np.ndarray + The final 2D Sv array + ping_bins: Iterable + A list whose elements are the lower and upper ping time bin ranges + er_bins: Iterable + A list whose elements are the lower and upper echo range bin ranges + final_er: np.ndarray + The final 2D ``echo_range`` array + final_ping_time: np.ndarray + The final 1D ``ping_time`` array + """ + + # randomly generate the number of pings in each ping bin + num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) + + # create bins for ping_time dimension + ping_csum = np.cumsum(num_pings_in_bin) + ping_bins = create_bins(ping_csum) + + # create bins for echo_range dimension + num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) + er_csum = np.cumsum(num_er_in_bin) + er_bins = create_bins(er_csum) + + # create the echo_range data and associated bin information + all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, + er_range, er_bins, + final_num_er_bins) + + # create the final echo_range array using created data and padding + final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum) + + # get final ping_time dimension + final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') + + # create the final sv array + final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, 
all_er_bin_nums, num_pings_in_bin) + return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time -def test_bin_and_mean_2d(): + +def test_bin_and_mean_2d() -> None: + """ + Tests the function ``bin_and_mean_2d``, which is the core + method for ``compute_MVBS``. This is done by creating mock + data (which can have varying number of ``echo_range`` values + for each ``ping_time``) with known means. + """ + + # TODO: create lazy option for sv and echo_range # TODO: document and create a fixture with input of create_dask @@ -627,15 +799,21 @@ def test_bin_and_mean_2d(): ping_range = [1, 5] er_range = [1, 5] - known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range) + # create echo_range, ping_time, and Sv arrays where the MVBS is known + known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, + final_num_er_bins, + ping_range, er_range) + # put the created ping bins into a form that works with bin_and_mean_2d digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) - digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) digitize_ping_bin = digitize_ping_bin.astype('datetime64[ns]') + # put the created echo range bins into a form that works with bin_and_mean_2d + digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) + + # calculate MVBS for mock data set calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, bins_er=digitize_er_bin, times=final_ping_time, echo_range=final_er) + # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) - - From 86a14914cfab071b6eb1a063efe437bfbae002cf Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 15:26:25 -0700 Subject: [PATCH 11/28] allow Sv and echo_range arrays to be dask 
arrays in mock data --- echopype/tests/preprocess/test_preprocess.py | 76 ++++++++++++++------ 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 9a7109a62..fc468801d 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,7 +6,7 @@ import dask.array from echopype.preprocess.api import bin_and_mean_2d -from typing import List, Tuple, Iterable +from typing import List, Tuple, Iterable, Union @pytest.fixture( @@ -549,7 +549,8 @@ def create_bins(csum_array: np.ndarray) -> Iterable: def create_echo_range_related_data(ping_bins: Iterable, num_pings_in_bin: np.ndarray, er_range: list, er_bins: Iterable, - final_num_er_bins: int) -> Tuple[list, list, list]: + final_num_er_bins: int, + create_dask: bool) -> Tuple[list, list, list]: """ Creates ``echo_range`` values and associated bin information. @@ -566,6 +567,9 @@ def create_echo_range_related_data(ping_bins: Iterable, A list whose elements are the lower and upper echo range bin ranges final_num_er_bins: int The total number of echo range bins + create_dask: bool + If True ``final_arrays`` values will be + dask arrays, else they will be numpy arrays Returns ------- @@ -574,7 +578,7 @@ def create_echo_range_related_data(ping_bins: Iterable, bin, for each ping bin ping_times_in_bin: list of np.ndarray A list whose elements are the ping_time values for each corresponding bin - final_arrays: list of np.ndarray + final_arrays: list of np.ndarray or dask.array.Array A list whose elements are the echo_range values for a given ping and echo range bin block """ @@ -597,9 +601,14 @@ def create_echo_range_related_data(ping_bins: Iterable, er_row_block = [] for count, bin_val in enumerate(er_bins): + # create a block of echo_range values - a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) + if create_dask: + a = 
dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) + else: + a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) # store the block of echo_range values er_row_block.append(a) @@ -611,7 +620,8 @@ def create_echo_range_related_data(ping_bins: Iterable, def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], - ping_csum: np.ndarray) -> Tuple[np.ndarray, int]: + ping_csum: np.ndarray, + create_dask: bool) -> Tuple[Union[np.ndarray, dask.array.Array], int]: """ Creates the final 2D ``echo_range`` array with appropriate padding. @@ -623,10 +633,12 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], ping_csum: np.ndarray 1D array representing the cumulative sum for the number of ping times in each ping bin + create_dask: bool + If True ``final_er`` will be a dask array, else it will be a numpy array Returns ------- - final_er: np.ndarray + final_er: np.ndarray or dask.array.Array The final 2D ``echo_range`` array max_num_er_elem: int The maximum number of ``echo_range`` elements amongst all times @@ -639,8 +651,12 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], tot_num_times = ping_csum[-1] # pad echo_range dimension with nans and create final echo_range - final_er = np.empty((tot_num_times, max_num_er_elem)) - final_er[:] = np.nan + if create_dask: + final_er = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan + else: + final_er = np.empty((tot_num_times, max_num_er_elem)) + final_er[:] = np.nan + for count, arr in enumerate(final_arrays): if count == 0: @@ -653,7 +669,9 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, all_er_bin_nums: Iterable[np.ndarray], - num_pings_in_bin: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + num_pings_in_bin: np.ndarray, + create_dask: bool) -> Tuple[Union[np.ndarray, 
dask.array.Array], + np.ndarray]: """ Creates the final 2D Sv array with appropriate padding. @@ -669,10 +687,12 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, bin, for each ping bin num_pings_in_bin: np.ndarray Specifies the number of pings in each ping time bin + create_dask: bool + If True ``final_sv`` will be a dask array, else it will be a numpy array Returns ------- - final_sv: np.ndarray + final_sv: np.ndarray or dask.array.Array The final 2D Sv array final_MVBS: np.ndarray The final 2D known MVBS array @@ -682,8 +702,11 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, tot_num_times = ping_csum[-1] # pad echo_range dimension with nans and create final sv - final_sv = np.empty((tot_num_times, max_num_er_elem)) - final_sv[:] = np.nan + if create_dask: + final_sv = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan + else: + final_sv = np.empty((tot_num_times, max_num_er_elem)) + final_sv[:] = np.nan final_means = [] for count, arr in enumerate(all_er_bin_nums): @@ -714,8 +737,8 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, def create_known_mean_data(final_num_ping_bins: int, final_num_er_bins: int, ping_range: list, - er_range: list) -> Tuple[np.ndarray, np.ndarray, Iterable, - Iterable, np.ndarray, np.ndarray]: + er_range: list, create_dask: bool) -> Tuple[np.ndarray, np.ndarray, Iterable, + Iterable, np.ndarray, np.ndarray]: """ Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays where the MVBS is known. @@ -732,6 +755,9 @@ def create_known_mean_data(final_num_ping_bins: int, er_range: list A list whose first element is the lowest and second element is the highest possible number of echo range values in a given bin + create_dask: bool + If True the ``Sv`` and ``echo_range`` values produced will be + dask arrays, else they will be numpy arrays. 
Returns ------- @@ -764,16 +790,18 @@ def create_known_mean_data(final_num_ping_bins: int, # create the echo_range data and associated bin information all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, er_range, er_bins, - final_num_er_bins) + final_num_er_bins, + create_dask) # create the final echo_range array using created data and padding - final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum) + final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum, create_dask) # get final ping_time dimension final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') # create the final sv array - final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, all_er_bin_nums, num_pings_in_bin) + final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, + all_er_bin_nums, num_pings_in_bin, create_dask) return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time @@ -784,10 +812,13 @@ def test_bin_and_mean_2d() -> None: method for ``compute_MVBS``. This is done by creating mock data (which can have varying number of ``echo_range`` values for each ``ping_time``) with known means. - """ - - # TODO: create lazy option for sv and echo_range + Parameters + ---------- + create_dask: bool + If True the ``Sv`` and ``echo_range`` values produced will be + dask arrays, else they will be numpy arrays. 
+ """ # TODO: document and create a fixture with input of create_dask @@ -802,7 +833,8 @@ def test_bin_and_mean_2d() -> None: # create echo_range, ping_time, and Sv arrays where the MVBS is known known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, - ping_range, er_range) + ping_range, er_range, + create_dask) # put the created ping bins into a form that works with bin_and_mean_2d digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) From 49b374563e8c3f80e2c6d39dda7cd56f6f8b5425 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 15:41:54 -0700 Subject: [PATCH 12/28] add pytest fixture to test_bin_and_mean_2d --- echopype/tests/preprocess/test_preprocess.py | 55 ++++++++++++++++---- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index fc468801d..7a6701795 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,7 +6,7 @@ import dask.array from echopype.preprocess.api import bin_and_mean_2d -from typing import List, Tuple, Iterable, Union +from typing import Tuple, Iterable, Union @pytest.fixture( @@ -806,7 +806,37 @@ def create_known_mean_data(final_num_ping_bins: int, return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time -def test_bin_and_mean_2d() -> None: +@pytest.fixture( + params=[ + { + "create_dask": True, + "final_num_ping_bins": 10, + "final_num_er_bins": 10, + "ping_range": [10, 1000], + "er_range": [10, 1000] + }, + { + "create_dask": False, + "final_num_ping_bins": 10, + "final_num_er_bins": 10, + "ping_range": [10, 1000], + "er_range": [10, 1000] + }, + ], + ids=[ + "delayed_data", + "in_memory_data" + ], +) +def bin_and_mean_2d_params(request): + """ + Obtains all necessary parameters for ``test_bin_and_mean_2d``. 
+ """ + + return list(request.param.values()) + + +def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: """ Tests the function ``bin_and_mean_2d``, which is the core method for ``compute_MVBS``. This is done by creating mock @@ -818,17 +848,20 @@ def test_bin_and_mean_2d() -> None: create_dask: bool If True the ``Sv`` and ``echo_range`` values produced will be dask arrays, else they will be numpy arrays. + final_num_ping_bins: int + The total number of ping time bins + final_num_er_bins: int + The total number of echo range bins + ping_range: list + A list whose first element is the lowest and second element is + the highest possible number of ping time values in a given bin + er_range: list + A list whose first element is the lowest and second element is + the highest possible number of echo range values in a given bin """ - # TODO: document and create a fixture with input of create_dask - - create_dask = True - - final_num_ping_bins = 2 - final_num_er_bins = 5 - - ping_range = [1, 5] - er_range = [1, 5] + # get all parameters needed to create the mock data + create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params # create echo_range, ping_time, and Sv arrays where the MVBS is known known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, From ae12a6e8f5c7aacab460ef336bc78cf8d01ccc28 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Fri, 4 Nov 2022 16:32:10 -0700 Subject: [PATCH 13/28] begin documenting all functions needed to compute MVBS along a channel --- echopype/preprocess/api.py | 94 ++++++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 13 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index fb4b17072..233233da7 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,7 +2,7 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" -from typing import Tuple +from typing import List, Tuple, Union import dask.array import numpy as np @@ -378,8 +378,34 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind -def mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): - # TODO: document +def mean_temp_arr( + n_bin_er: int, + temp_arr: Union[dask.array.Array, np.ndarray], + dig_er_subset: Union[dask.array.Array, np.ndarray], +) -> List[Union[dask.array.Array, np.ndarray]]: + """ + Bins the data in ``temp_arr`` with respect to the + ``echo_range`` bin and means the resulting bin. + + Parameters + ---------- + n_bin_er: int + The number of echo range bins + temp_arr: dask.array.Array or np.ndarray + Array of Sv values at the ``ping_time`` bin index being considered + dig_er_subset: dask.array.Array or np.ndarray + Array representing the digitized (bin indices) for ``echo_range`` at + the ``ping_time`` bin index being considered + + Returns + ------- + means: list of dask.array.Array or np.ndarray + + Notes + ----- + It is necessary for this to be a function because we may need to + delay it. + """ means = [] for bin_er in range(1, n_bin_er): @@ -388,29 +414,50 @@ def mean_temp_arr(n_bin_er, temp_arr, dig_er_subset): return means -def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): +def bin_and_mean_2d( + arr: Union[dask.array.Array, np.ndarray], + bins_time: np.ndarray, + bins_er: np.ndarray, + times: np.ndarray, + echo_range: np.ndarray, +) -> np.ndarray: """ + Bins and means ``arr`` based on ``times`` and ``echo_range``, + and their corresponding bins. If ``arr`` is ``Sv`` then this + will compute the MVBS. 
+ Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + echo_range: np.ndarray + 2D array of echo range values Returns ------- - + final_reduced: np.ndarray + The final binned and meaned ``arr``, if ``arr`` is ``Sv`` then this is the MVBS Notes ----- This function assumes that ``arr`` has rows corresponding to - ping time and columns corresponding to echo range. - """ + ``ping_time`` and columns corresponding to ``echo_range``. - # TODO: document + This function allows the number of ``echo_range`` values to + vary amongst ``ping_times``. + """ # determine if array to bin is lazy or not is_lazy = False if isinstance(arr, dask.array.Array): is_lazy = True - print(f"is_lazy = {is_lazy}") - # get the number of echo range and time bins n_bin_er = len(bins_er) n_bin_time = len(bins_time) @@ -427,6 +474,7 @@ def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): # select only those array values that are in the time bin being considered temp_arr = arr[indices_time, :] + # bin and mean with respect to echo_range bins if is_lazy: all_means.append( dask.delayed(mean_temp_arr)( @@ -439,22 +487,41 @@ def bin_and_mean_2d(arr, bins_time, bins_er, times, echo_range): ) if is_lazy: + # compute all constructs means all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? 
+ # construct final reduced form of arr final_reduced = np.array(all_means) return final_reduced -def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): +def get_MVBS_along_channels( + ds_Sv: xr.Dataset, range_interval: np.ndarray, ping_interval: np.ndarray +): + """ + Computes the MVBS of ``ds_SV`` along each channel for the given + intervals. + + Parameters + ---------- + ds_Sv: xr.Dataset + + range_interval: np.ndarray + + ping_interval: np.ndarray + + + Returns + ------- + + """ # range_meter_bin = 5 # range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) # ping_interval = np.array(list(ds_Sv.ping_time. # resample(ping_time='20s', skipna=True).groups.keys())) - print("account for uneven echo range ") - all_MVBS = [] for chan in ds_Sv.channel: @@ -477,4 +544,5 @@ def get_MVBS_along_channels(ds_Sv, range_interval, ping_interval): # apply inverse mapping to get back to the original domain and store values all_MVBS.append(10 * np.log10(chan_MVBS)) + # collect the MVBS values for each channel return np.stack(all_MVBS, axis=0) From 2a4afcd2c1e5540c0a7e9fee2e579f136f1dff0f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 7 Nov 2022 08:45:41 -0800 Subject: [PATCH 14/28] finish documenting get_MVBS_along_channels function --- echopype/preprocess/api.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 233233da7..c61b4c1ad 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -442,7 +442,7 @@ def bin_and_mean_2d( Returns ------- final_reduced: np.ndarray - The final binned and meaned ``arr``, if ``arr`` is ``Sv`` then this is the MVBS + The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS Notes ----- @@ -497,31 +497,36 @@ def bin_and_mean_2d( def get_MVBS_along_channels( - ds_Sv: xr.Dataset, range_interval: np.ndarray, ping_interval: np.ndarray -): + ds_Sv: 
xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray +) -> np.ndarray: """ - Computes the MVBS of ``ds_SV`` along each channel for the given + Computes the MVBS of ``ds_Sv`` along each channel for the given intervals. Parameters ---------- ds_Sv: xr.Dataset - - range_interval: np.ndarray - + A Dataset containing ``Sv`` and ``echo_range`` data with coordinates + ``channel``, ``ping_time``, and ``range_sample`` + echo_range_interval: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` ping_interval: np.ndarray - + 1D array (used by np.digitize) representing the binning required for ``ping_time`` Returns ------- + np.ndarray + The MVBS value of the input ``ds_Sv`` for all channels + Notes + ----- + If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with + respect to ``echo_range`` will take place, then the binning and mean with respect to + ``ping_time`` will be a delayed operation, and lastly the delayed values will be + computed. It is necessary to apply a compute at the end of this method because Dask + graph layers get too large and this makes downstream operations very inefficient. """ - # range_meter_bin = 5 - # range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) - # ping_interval = np.array(list(ds_Sv.ping_time. 
- # resample(ping_time='20s', skipna=True).groups.keys())) - all_MVBS = [] for chan in ds_Sv.channel: @@ -536,7 +541,7 @@ def get_MVBS_along_channels( chan_MVBS = bin_and_mean_2d( sv.data, bins_time=ping_interval, - bins_er=range_interval, + bins_er=echo_range_interval, times=sv.ping_time.data, echo_range=ds["echo_range"].data, ) From d55ef170c0c0fc2efecf32b8b444b437892c3e38 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Mon, 7 Nov 2022 17:17:52 -0800 Subject: [PATCH 15/28] incorporate new bin and mean method into compute_MVBS and go back to old way of assuming echo_range is the same throughout ping_time --- echopype/preprocess/__init__.py | 3 +- echopype/preprocess/api.py | 207 +++++++++++++++++++++++++++----- 2 files changed, 180 insertions(+), 30 deletions(-) diff --git a/echopype/preprocess/__init__.py b/echopype/preprocess/__init__.py index c6f9606d5..1f7576bfd 100644 --- a/echopype/preprocess/__init__.py +++ b/echopype/preprocess/__init__.py @@ -1,7 +1,8 @@ -from .api import compute_MVBS, compute_MVBS_index_binning, remove_noise +from .api import compute_MVBS, compute_MVBS_index_binning, compute_MVBS_v2, remove_noise __all__ = [ "compute_MVBS", + "compute_MVBS_v2", # TODO: remove this! "compute_MVBS_index_binning", "remove_noise", ] diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index c61b4c1ad..8e195ae19 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -192,6 +192,122 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): return ds_MVBS +def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): + """ + Compute Mean Volume Backscattering Strength (MVBS) + based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. + + Output of this function differs from that of ``compute_MVBS_index_binning``, which computes + bin-averaged Sv according to intervals of ``echo_range`` and ``ping_time`` specified as + index number. 
+ + Parameters + ---------- + ds_Sv : xr.Dataset + dataset containing Sv and ``echo_range`` [m] + range_meter_bin : Union[int, float] + bin size along ``echo_range`` in meters, default to ``20`` + ping_time_bin : str + bin size along ``ping_time``, default to ``20S`` + + Returns + ------- + A dataset containing bin-averaged Sv + """ + + # TODO: remove this as it is no longer necessary + # if not ds_Sv["echo_range"].groupby("channel").apply(_check_range_uniqueness).all(): + # raise ValueError( + # "echo_range variable changes across pings in at least one of the frequency channels." + # ) + + # create bin information for echo_range + range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) + + # create bin information needed for ping_time + # ping_interval = np.array(list(ds_Sv.ping_time.resample(ping_time=ping_time_bin, + # skipna=True).groups.keys())) + + ping_interval = ( + ds_Sv.ping_time.resample(ping_time=ping_time_bin, skipna=True).asfreq().ping_time.values + ) + + # calculate the MVBS along each channel + MVBS_values = get_MVBS_along_channels(ds_Sv, range_interval, ping_interval) + + # create MVBS dataset + ds_MVBS = xr.Dataset( + data_vars={"Sv": (["channel", "ping_time", "echo_range"], MVBS_values)}, + coords={ + "ping_time": ping_interval, + "channel": ds_Sv.channel, + "echo_range": range_interval[:-1], + }, + ) + + # Added this check to support the test in test_process.py::test_compute_MVBS + if "filenames" in ds_MVBS.variables: + ds_MVBS = ds_MVBS.drop_vars("filenames") + + # ping_time_bin parsing and conversions + # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings + # https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.resample.html + # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.html + # 
https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
+    # https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units
+    timedelta_units = {
+        "d": {"nptd64": "D", "unitstr": "day"},
+        "h": {"nptd64": "h", "unitstr": "hour"},
+        "t": {"nptd64": "m", "unitstr": "minute"},
+        "min": {"nptd64": "m", "unitstr": "minute"},
+        "s": {"nptd64": "s", "unitstr": "second"},
+        "l": {"nptd64": "ms", "unitstr": "millisecond"},
+        "ms": {"nptd64": "ms", "unitstr": "millisecond"},
+        "u": {"nptd64": "us", "unitstr": "microsecond"},
+        "us": {"nptd64": "us", "unitstr": "microsecond"},
+        "n": {"nptd64": "ns", "unitstr": "nanosecond"},
+        "ns": {"nptd64": "ns", "unitstr": "nanosecond"},
+    }
+    ping_time_bin_td = pd.Timedelta(ping_time_bin)
+    # res = resolution (most granular time unit)
+    ping_time_bin_resunit = ping_time_bin_td.resolution_string.lower()
+    ping_time_bin_resvalue = int(
+        ping_time_bin_td / np.timedelta64(1, timedelta_units[ping_time_bin_resunit]["nptd64"])
+    )
+    ping_time_bin_resunit_label = timedelta_units[ping_time_bin_resunit]["unitstr"]
+
+    # Attach attributes
+    _set_MVBS_attrs(ds_MVBS)
+    ds_MVBS["echo_range"].attrs = {"long_name": "Range distance", "units": "m"}
+    ds_MVBS["Sv"] = ds_MVBS["Sv"].assign_attrs(
+        {
+            "cell_methods": (
+                f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} "  # noqa
+                "comment: ping_time is the interval start) "
+                f"echo_range: mean (interval: {range_meter_bin} meter "
+                "comment: echo_range is the interval start)"
+            ),
+            "binning_mode": "physical units",
+            "range_meter_interval": str(range_meter_bin) + "m",
+            "ping_time_interval": ping_time_bin,
+            "actual_range": [
+                round(float(ds_MVBS["Sv"].min().values), 2),
+                round(float(ds_MVBS["Sv"].max().values), 2),
+            ],
+        }
+    )
+
+    prov_dict = echopype_prov_attrs(process_type="processing")
+    prov_dict["processing_function"] = 
"preprocess.compute_MVBS" + ds_MVBS = ds_MVBS.assign_attrs(prov_dict) + ds_MVBS["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal + + return ds_MVBS + # return MVBS_values + + def compute_MVBS_index_binning(ds_Sv, range_sample_num=100, ping_num=100): """ Compute Mean Volume Backscattering Strength (MVBS) @@ -454,46 +570,67 @@ def bin_and_mean_2d( """ # determine if array to bin is lazy or not - is_lazy = False - if isinstance(arr, dask.array.Array): - is_lazy = True + # is_lazy = False + # if isinstance(arr, dask.array.Array): + # is_lazy = True # get the number of echo range and time bins n_bin_er = len(bins_er) n_bin_time = len(bins_time) + print(f"n_bin_time = {n_bin_time}") # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - all_means = [] - for bin_time in range(1, n_bin_time + 1): - - # get the indices of time in bin index bin_time - indices_time = np.argwhere(bin_time_ind == bin_time).flatten() - - # select only those array values that are in the time bin being considered - temp_arr = arr[indices_time, :] + binned_means = [] + for bin_er in range(1, n_bin_er): + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - # bin and mean with respect to echo_range bins - if is_lazy: - all_means.append( - dask.delayed(mean_temp_arr)( - n_bin_er, temp_arr, digitized_echo_range[indices_time, :] - ) - ) - else: - all_means.append( - mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) - ) + binned_means.append(er_selected_data) - if is_lazy: - # compute all constructs means - all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? 
+ er_means = np.vstack(binned_means).compute() - # construct final reduced form of arr - final_reduced = np.array(all_means) + final = np.empty((n_bin_time, n_bin_er - 1)) + # for bin_time in range(1, n_bin_time + 1): + for bin_time in range(1, n_bin_time): + indices = np.argwhere(bin_time_ind == bin_time).flatten() - return final_reduced + if len(indices) == 0: + final[bin_time - 1, :] = np.nanmean(er_means[:, :], axis=1) # TODO: look into this + else: + final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) + + return final + + # all_means = [] + # for bin_time in range(1, n_bin_time + 1): + # + # # get the indices of time in bin index bin_time + # indices_time = np.argwhere(bin_time_ind == bin_time).flatten() + # + # # select only those array values that are in the time bin being considered + # temp_arr = arr[indices_time, :] + # + # # bin and mean with respect to echo_range bins + # if is_lazy: + # all_means.append( + # dask.delayed(mean_temp_arr)( + # n_bin_er, temp_arr, digitized_echo_range[indices_time, :] + # ) + # ) + # else: + # all_means.append( + # mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) + # ) + + # if is_lazy: + # # compute all constructs means + # all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? + # + # # construct final reduced form of arr + # final_reduced = np.array(all_means) + # + # return final_reduced def get_MVBS_along_channels( @@ -534,6 +671,14 @@ def get_MVBS_along_channels( # TODO: not sure why not already removed for the AZFP case. Investigate. 
ds = ds_Sv.sel(channel=chan).squeeze() + echo_range = ( + ds["echo_range"] + .dropna(dim="range_sample", how="all") + .dropna(dim="ping_time") + .isel(ping_time=0) + .values + ) + # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) @@ -543,9 +688,13 @@ def get_MVBS_along_channels( bins_time=ping_interval, bins_er=echo_range_interval, times=sv.ping_time.data, - echo_range=ds["echo_range"].data, + echo_range=echo_range, # ds["echo_range"].data, ) + # all_MVBS.append(chan_MVBS) + # all_MVBS.extend(chan_MVBS) + # return all_MVBS + # apply inverse mapping to get back to the original domain and store values all_MVBS.append(10 * np.log10(chan_MVBS)) From cc02dadcc4bc0fb32e709ca3d3ae8470ac904857 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 09:22:15 -0800 Subject: [PATCH 16/28] make the MVBS output values of new method be equivalent to the old method by changing the bins produced by the resample method --- echopype/preprocess/api.py | 117 +++++++++---------------------------- 1 file changed, 26 insertions(+), 91 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 8e195ae19..ec33959e9 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,7 +2,7 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" -from typing import List, Tuple, Union +from typing import Tuple, Union import dask.array import numpy as np @@ -225,9 +225,6 @@ def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) # create bin information needed for ping_time - # ping_interval = np.array(list(ds_Sv.ping_time.resample(ping_time=ping_time_bin, - # skipna=True).groups.keys())) - ping_interval = ( ds_Sv.ping_time.resample(ping_time=ping_time_bin, skipna=True).asfreq().ping_time.values ) @@ -305,7 +302,6 @@ def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): ds_MVBS["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal return ds_MVBS - # return MVBS_values def compute_MVBS_index_binning(ds_Sv, range_sample_num=100, ping_num=100): @@ -494,42 +490,6 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind -def mean_temp_arr( - n_bin_er: int, - temp_arr: Union[dask.array.Array, np.ndarray], - dig_er_subset: Union[dask.array.Array, np.ndarray], -) -> List[Union[dask.array.Array, np.ndarray]]: - """ - Bins the data in ``temp_arr`` with respect to the - ``echo_range`` bin and means the resulting bin. - - Parameters - ---------- - n_bin_er: int - The number of echo range bins - temp_arr: dask.array.Array or np.ndarray - Array of Sv values at the ``ping_time`` bin index being considered - dig_er_subset: dask.array.Array or np.ndarray - Array representing the digitized (bin indices) for ``echo_range`` at - the ``ping_time`` bin index being considered - - Returns - ------- - means: list of dask.array.Array or np.ndarray - - Notes - ----- - It is necessary for this to be a function because we may need to - delay it. 
- """ - - means = [] - for bin_er in range(1, n_bin_er): - means.append(np.nanmean(temp_arr[dig_er_subset == bin_er], axis=0)) - - return means - - def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, @@ -565,73 +525,52 @@ def bin_and_mean_2d( This function assumes that ``arr`` has rows corresponding to ``ping_time`` and columns corresponding to ``echo_range``. - This function allows the number of ``echo_range`` values to + This function should not be run if the number of ``echo_range`` values vary amongst ``ping_times``. """ - # determine if array to bin is lazy or not - # is_lazy = False - # if isinstance(arr, dask.array.Array): - # is_lazy = True - # get the number of echo range and time bins n_bin_er = len(bins_er) n_bin_time = len(bins_time) - print(f"n_bin_time = {n_bin_time}") # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) binned_means = [] for bin_er in range(1, n_bin_er): + + # bin and mean echo_range dimension er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # collect all echo_range bins binned_means.append(er_selected_data) - er_means = np.vstack(binned_means).compute() + # create full echo_range binned array + er_means = np.vstack(binned_means) + + # if er_means is a dask array we compute it so the graph does not get too large + if isinstance(er_means, dask.array.Array): + er_means = er_means.compute() + # create final reduced array final = np.empty((n_bin_time, n_bin_er - 1)) - # for bin_time in range(1, n_bin_time + 1): - for bin_time in range(1, n_bin_time): + + for bin_time in range(1, n_bin_time + 1): + + # obtain er_mean indices corresponding to the time bin indices = np.argwhere(bin_time_ind == bin_time).flatten() if len(indices) == 0: - final[bin_time - 1, :] = np.nanmean(er_means[:, :], axis=1) # TODO: look into this + + # fill values with NaN, if there are no values in the bin + 
final[bin_time - 1, :] = np.nan else: + + # bin and mean the er_mean time bin final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) return final - # all_means = [] - # for bin_time in range(1, n_bin_time + 1): - # - # # get the indices of time in bin index bin_time - # indices_time = np.argwhere(bin_time_ind == bin_time).flatten() - # - # # select only those array values that are in the time bin being considered - # temp_arr = arr[indices_time, :] - # - # # bin and mean with respect to echo_range bins - # if is_lazy: - # all_means.append( - # dask.delayed(mean_temp_arr)( - # n_bin_er, temp_arr, digitized_echo_range[indices_time, :] - # ) - # ) - # else: - # all_means.append( - # mean_temp_arr(n_bin_er, temp_arr, digitized_echo_range[indices_time, :]) - # ) - - # if is_lazy: - # # compute all constructs means - # all_means = dask.compute(all_means)[0] # TODO: make this into persist in the future? - # - # # construct final reduced form of arr - # final_reduced = np.array(all_means) - # - # return final_reduced - def get_MVBS_along_channels( ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray @@ -658,10 +597,10 @@ def get_MVBS_along_channels( Notes ----- If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with - respect to ``echo_range`` will take place, then the binning and mean with respect to - ``ping_time`` will be a delayed operation, and lastly the delayed values will be - computed. It is necessary to apply a compute at the end of this method because Dask - graph layers get too large and this makes downstream operations very inefficient. + respect to ``echo_range`` will take place, then the delayed result will be computed, + and lastly the binning and mean with respect to ``ping_time`` will be completed. It + is necessary to apply a compute midway through this method because Dask graph layers + get too large and this makes downstream operations very inefficient. 
""" all_MVBS = [] @@ -688,13 +627,9 @@ def get_MVBS_along_channels( bins_time=ping_interval, bins_er=echo_range_interval, times=sv.ping_time.data, - echo_range=echo_range, # ds["echo_range"].data, + echo_range=echo_range, ) - # all_MVBS.append(chan_MVBS) - # all_MVBS.extend(chan_MVBS) - # return all_MVBS - # apply inverse mapping to get back to the original domain and store values all_MVBS.append(10 * np.log10(chan_MVBS)) From 7dadbf47f091934a276f2adc4a152bda9f95761f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 11:27:24 -0800 Subject: [PATCH 17/28] add code section that groups different echo_range values, computes them, and then constructs the appropriate er_means array --- echopype/preprocess/api.py | 79 ++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index ec33959e9..5f6eab54a 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -490,6 +490,24 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind +def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): + + # TODO: document! + + binned_means = [] + for bin_er in range(1, n_bin_er): + # bin and mean echo_range dimension + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + + # collect all echo_range bins + binned_means.append(er_selected_data) + + # create full echo_range binned array + er_means = np.vstack(binned_means) + + return er_means + + def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, @@ -536,17 +554,52 @@ def bin_and_mean_2d( # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - binned_means = [] - for bin_er in range(1, n_bin_er): + # TODO: clean up the below code! 
- # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() - # collect all echo_range bins - binned_means.append(er_selected_data) + unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) - # create full echo_range binned array - er_means = np.vstack(binned_means) + # TODO: put the below into its own function + if unique_er_bin_ind.shape[0] != 1: + + print("grouping necessary") + print(unique_er_bin_ind.shape[0]) + print(unique_er_bin_ind) + grps_same_ind = [ + np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) + ] + + binned_er = [] # the values appended may not be in the correct order + for count, grp in enumerate(grps_same_ind): + binned_er.append( + bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) + ) + + # put the columns in the correct order + binned_er_array = np.hstack(binned_er) + correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) + er_means = binned_er_array[:, correct_column_ind] + + else: + print("no grouping necessary") + # make sure this correctly works + er_means = bin_and_mean_echo_range(arr, unique_er_bin_ind[0, :], n_bin_er) + + # binned_means = [] + # for bin_er in range(1, n_bin_er): + # + # # bin and mean echo_range dimension + # er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # + # # collect all echo_range bins + # binned_means.append(er_selected_data) + # + # # create full echo_range binned array + # er_means = np.vstack(binned_means) + # # if er_means is a dask array we compute it so the graph does not get too large if isinstance(er_means, dask.array.Array): @@ -610,14 +663,6 @@ def get_MVBS_along_channels( # TODO: not sure why not already 
removed for the AZFP case. Investigate. ds = ds_Sv.sel(channel=chan).squeeze() - echo_range = ( - ds["echo_range"] - .dropna(dim="range_sample", how="all") - .dropna(dim="ping_time") - .isel(ping_time=0) - .values - ) - # average should be done in linear domain sv = 10 ** (ds["Sv"] / 10) @@ -627,7 +672,7 @@ def get_MVBS_along_channels( bins_time=ping_interval, bins_er=echo_range_interval, times=sv.ping_time.data, - echo_range=echo_range, + echo_range=ds["echo_range"], ) # apply inverse mapping to get back to the original domain and store values From 42d18a85cc02b84d19869029734566ebb9fb4d5f Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 16:31:11 -0800 Subject: [PATCH 18/28] add two functions (less and more comprehensive) that determine if grouping echo_range with respect to ping_time is needed --- echopype/preprocess/api.py | 126 +++++++++++++++---- echopype/tests/preprocess/test_preprocess.py | 2 + 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 5f6eab54a..0cb7befcd 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -508,12 +508,96 @@ def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): return er_means +def get_unequal_rows(mat, row): + # compare row against all rows in mat (allowing for NaNs to be equal) + element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) + + # get those row indices that are not equal to row + row_not_equal = np.argwhere(np.logical_not(np.all(element_nan_equal, axis=1))).flatten() + + return row_not_equal + + +def is_grouping_needed_comprehensive(er_chan): + + if isinstance(er_chan, xr.DataArray): + er_chan = er_chan.values + + # grab the first ping_time that is not filled with NaNs + ping_index = 0 + while np.all(np.isnan(er_chan[ping_index, :])): + ping_index += 1 + + unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) + + if len(unequal_ping_ind) > 0: + + # see if all 
unequal_ping_ind are filled with NaNs + all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) + + if all_nans: + # All echo_range values have the same step size + return False + else: + # Some echo_range values have different step sizes + return True + else: + # All echo_range values have the same step size + return False + + +def is_grouping_needed_less_comprehensive(er_chan): + """ + + + Parameters + ---------- + er_chan + + Notes + ----- + Although this method is slightly faster than ``is_grouping_needed_comprehensive``, + it is possible that this method will incorrectly determine if grouping + is necessary. + """ + + # determine the number of NaNs in each ping and find the unique number of NaNs + unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) + + if isinstance(unique_num_nans, dask.array.Array): + unique_num_nans = unique_num_nans.compute() + + # determine if any value is not 0 or er_chan.shape[1] + unexpected_num_nans = False in np.logical_or( + unique_num_nans == 0, unique_num_nans == er_chan.shape[1] + ) + + if unexpected_num_nans: + # echo_range varies with ping_time + return True + else: + + # make sure that the final echo_range value for each ping_time is the same (account for NaN) + num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() + + if isinstance(num_non_nans, dask.array.Array): + num_non_nans = num_non_nans.compute() + + if num_non_nans > 1: + # echo_range varies with ping_time + return True + else: + # echo_range does not vary with ping_time + return False + + def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, bins_er: np.ndarray, times: np.ndarray, echo_range: np.ndarray, + comp_er_check: bool = True, ) -> np.ndarray: """ Bins and means ``arr`` based on ``times`` and ``echo_range``, @@ -532,6 +616,9 @@ def bin_and_mean_2d( 1D array corresponding to the time values that should be binned echo_range: np.ndarray 2D array of echo range values + 
comp_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done Returns ------- @@ -556,18 +643,23 @@ def bin_and_mean_2d( # TODO: clean up the below code! - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) + if comp_er_check: + grouping_needed = is_grouping_needed_comprehensive(echo_range) + else: + grouping_needed = is_grouping_needed_less_comprehensive(echo_range) - # TODO: put the below into its own function - if unique_er_bin_ind.shape[0] != 1: + # TODO: put the below into its own function + if grouping_needed: print("grouping necessary") - print(unique_er_bin_ind.shape[0]) - print(unique_er_bin_ind) + + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() + + unique_er_bin_ind, unique_inverse = np.unique( + digitized_echo_range, axis=0, return_inverse=True + ) grps_same_ind = [ np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) ] @@ -585,21 +677,7 @@ def bin_and_mean_2d( else: print("no grouping necessary") - # make sure this correctly works - er_means = bin_and_mean_echo_range(arr, unique_er_bin_ind[0, :], n_bin_er) - - # binned_means = [] - # for bin_er in range(1, n_bin_er): - # - # # bin and mean echo_range dimension - # er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - # - # # collect all echo_range bins - # binned_means.append(er_selected_data) - # - # # create full echo_range binned array - # er_means = np.vstack(binned_means) - # + er_means = 
bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) # if er_means is a dask array we compute it so the graph does not get too large if isinstance(er_means, dask.array.Array): diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 7a6701795..87f9c7adf 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -860,6 +860,8 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: the highest possible number of echo range values in a given bin """ + # TODO: add a seed here and print it so we can debug if necessary + # get all parameters needed to create the mock data create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params From e666950adbb56839babcf5c31a61d1a33d655687 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Tue, 8 Nov 2022 16:49:52 -0800 Subject: [PATCH 19/28] modify get_unequal_rows to account for dask input and add bool input to bin_and_mean_2d --- echopype/preprocess/api.py | 11 +++++++++-- echopype/tests/preprocess/test_preprocess.py | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 0cb7befcd..b6f018f59 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -512,10 +512,16 @@ def get_unequal_rows(mat, row): # compare row against all rows in mat (allowing for NaNs to be equal) element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) + # determine if mat row is equal to row + row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) + + if isinstance(row_not_equal, dask.array.Array): + row_not_equal = row_not_equal.compute() + # get those row indices that are not equal to row - row_not_equal = np.argwhere(np.logical_not(np.all(element_nan_equal, axis=1))).flatten() + row_ind_not_equal = np.argwhere(row_not_equal).flatten() - return row_not_equal + return row_ind_not_equal 
def is_grouping_needed_comprehensive(er_chan): @@ -751,6 +757,7 @@ def get_MVBS_along_channels( bins_er=echo_range_interval, times=sv.ping_time.data, echo_range=ds["echo_range"], + comp_er_check=True, ) # apply inverse mapping to get back to the original domain and store values diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 87f9c7adf..a4a8b03ee 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -880,7 +880,8 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: # calculate MVBS for mock data set calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, - bins_er=digitize_er_bin, times=final_ping_time, echo_range=final_er) + bins_er=digitize_er_bin, times=final_ping_time, + echo_range=final_er, comp_er_check=True) # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) From cc8770cb402dd5963904b90559ea5b118510a835 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 9 Nov 2022 09:16:10 -0800 Subject: [PATCH 20/28] add seed to test_bin_and_mean_2d and randomly choose a ping bin to be completely NaN --- echopype/preprocess/api.py | 3 - echopype/tests/preprocess/test_preprocess.py | 81 +++++++++++++++----- 2 files changed, 62 insertions(+), 22 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index b6f018f59..20e7fc3c6 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -657,8 +657,6 @@ def bin_and_mean_2d( # TODO: put the below into its own function if grouping_needed: - print("grouping necessary") - # compute bin indices to allow for downstream processes (mainly axis argument in unique) if isinstance(digitized_echo_range, dask.array.Array): digitized_echo_range = digitized_echo_range.compute() @@ -682,7 +680,6 @@ def bin_and_mean_2d( er_means = binned_er_array[:, correct_column_ind] else: - 
print("no grouping necessary") er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) # if er_means is a dask array we compute it so the graph does not get too large diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index a4a8b03ee..40eb058ca 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -4,6 +4,7 @@ import echopype as ep import pytest import dask.array +from numpy.random import default_rng from echopype.preprocess.api import bin_and_mean_2d from typing import Tuple, Iterable, Union @@ -550,7 +551,9 @@ def create_echo_range_related_data(ping_bins: Iterable, num_pings_in_bin: np.ndarray, er_range: list, er_bins: Iterable, final_num_er_bins: int, - create_dask: bool) -> Tuple[list, list, list]: + create_dask: bool, + rng: np.random.Generator, + ping_bin_nan_ind: int) -> Tuple[list, list, list]: """ Creates ``echo_range`` values and associated bin information. 
@@ -570,6 +573,10 @@ def create_echo_range_related_data(ping_bins: Iterable, create_dask: bool If True ``final_arrays`` values will be dask arrays, else they will be numpy arrays + rng: np.random.Generator + The generator for random values + ping_bin_nan_ind: int + The ping bin index to fill with NaNs Returns ------- @@ -591,10 +598,10 @@ def create_echo_range_related_data(ping_bins: Iterable, for ping_ind, ping_bin in enumerate(ping_bins): # create the ping times associated with each ping bin - ping_times_in_bin.append(np.random.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) + ping_times_in_bin.append(rng.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) # randomly determine the number of values in each echo_range bin - num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) + num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) # store the number of values in each echo_range bin all_er_bin_nums.append(num_er_in_bin) @@ -607,12 +614,16 @@ def create_echo_range_related_data(ping_bins: Iterable, a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], num_er_in_bin[count])) else: - a = np.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) + a = rng.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], + num_er_in_bin[count])) # store the block of echo_range values er_row_block.append(a) + # set all echo_range values at ping index to NaN + if ping_ind == ping_bin_nan_ind: + a[:, :] = np.nan + # collect and construct echo_range row block final_arrays.append(np.concatenate(er_row_block, axis=1)) @@ -670,8 +681,9 @@ def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, all_er_bin_nums: Iterable[np.ndarray], num_pings_in_bin: np.ndarray, - create_dask: bool) -> Tuple[Union[np.ndarray, dask.array.Array], - 
np.ndarray]: + create_dask: bool, + ping_bin_nan_ind: int) -> Tuple[Union[np.ndarray, dask.array.Array], + np.ndarray]: """ Creates the final 2D Sv array with appropriate padding. @@ -689,6 +701,8 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, Specifies the number of pings in each ping time bin create_dask: bool If True ``final_sv`` will be a dask array, else it will be a numpy array + ping_bin_nan_ind: int + The ping bin index to fill with NaNs Returns ------- @@ -712,17 +726,26 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, for count, arr in enumerate(all_er_bin_nums): # create sv row values using natural numbers - sv_row_list = [np.arange(1, num_elem + 1, 1) for num_elem in arr] + sv_row_list = [np.arange(1, num_elem + 1, 1, dtype=np.float64) for num_elem in arr] # create final sv row sv_row = np.concatenate(sv_row_list) # get final mean which is n+1/2 (since we are using natural numbers) - final_means.append([(len(elem) + 1) / 2.0 for elem in sv_row_list]) + ping_mean = [(len(elem) + 1) / 2.0 for elem in sv_row_list] # create sv row block sv_row_block = np.tile(sv_row, (num_pings_in_bin[count], 1)) + if count == ping_bin_nan_ind: + + # fill values with NaNs + ping_mean = [np.nan]*len(sv_row_list) + sv_row_block[:, :] = np.nan + + # store means for ping + final_means.append(ping_mean) + if count == 0: final_sv[0:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block else: @@ -737,8 +760,9 @@ def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, def create_known_mean_data(final_num_ping_bins: int, final_num_er_bins: int, ping_range: list, - er_range: list, create_dask: bool) -> Tuple[np.ndarray, np.ndarray, Iterable, - Iterable, np.ndarray, np.ndarray]: + er_range: list, create_dask: bool, + rng: np.random.Generator) -> Tuple[np.ndarray, np.ndarray, Iterable, + Iterable, np.ndarray, np.ndarray]: """ Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays where the MVBS 
is known. @@ -758,6 +782,8 @@ def create_known_mean_data(final_num_ping_bins: int, create_dask: bool If True the ``Sv`` and ``echo_range`` values produced will be dask arrays, else they will be numpy arrays. + rng: np.random.Generator + generator for random integers Returns ------- @@ -776,22 +802,27 @@ def create_known_mean_data(final_num_ping_bins: int, """ # randomly generate the number of pings in each ping bin - num_pings_in_bin = np.random.randint(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) + num_pings_in_bin = rng.integers(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) # create bins for ping_time dimension ping_csum = np.cumsum(num_pings_in_bin) ping_bins = create_bins(ping_csum) # create bins for echo_range dimension - num_er_in_bin = np.random.randint(low=er_range[0], high=er_range[1], size=final_num_er_bins) + num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) er_csum = np.cumsum(num_er_in_bin) er_bins = create_bins(er_csum) + # randomly select one ping bin to fill with NaNs + ping_bin_nan_ind = rng.choice(len(ping_bins)) + # create the echo_range data and associated bin information all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, er_range, er_bins, final_num_er_bins, - create_dask) + create_dask, + rng, + ping_bin_nan_ind) # create the final echo_range array using created data and padding final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum, create_dask) @@ -801,7 +832,8 @@ def create_known_mean_data(final_num_ping_bins: int, # create the final sv array final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, - all_er_bin_nums, num_pings_in_bin, create_dask) + all_er_bin_nums, num_pings_in_bin, + create_dask, ping_bin_nan_ind) return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time @@ -860,16 +892,27 @@ def 
test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: the highest possible number of echo range values in a given bin """ - # TODO: add a seed here and print it so we can debug if necessary - # get all parameters needed to create the mock data create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params + # randomly generate a seed + seed = np.random.randint(low=10, high=100000, size=1)[0] + + print(f"seed used to generate mock data: {seed}") + + # establish generator for random integers + rng = default_rng(seed=seed) + + # seed dask random generator + if create_dask: + dask.array.random.seed(seed=seed) + # create echo_range, ping_time, and Sv arrays where the MVBS is known known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, final_num_er_bins, ping_range, er_range, - create_dask) + create_dask, + rng) # put the created ping bins into a form that works with bin_and_mean_2d digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) @@ -884,4 +927,4 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: echo_range=final_er, comp_er_check=True) # compare known MVBS solution against its calculated counterpart - assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10) + assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) From 12686adc12cefadff6edb22342bc4614929ff506 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 9 Nov 2022 09:24:11 -0800 Subject: [PATCH 21/28] remove old functions associated with compute_MVBS and use the new compute_MVBS methods only --- echopype/preprocess/__init__.py | 3 +- echopype/preprocess/api.py | 169 ++------------------------------ 2 files changed, 9 insertions(+), 163 deletions(-) diff --git a/echopype/preprocess/__init__.py b/echopype/preprocess/__init__.py index 1f7576bfd..c6f9606d5 100644 --- a/echopype/preprocess/__init__.py +++ b/echopype/preprocess/__init__.py 
@@ -1,8 +1,7 @@ -from .api import compute_MVBS, compute_MVBS_index_binning, compute_MVBS_v2, remove_noise +from .api import compute_MVBS, compute_MVBS_index_binning, remove_noise __all__ = [ "compute_MVBS", - "compute_MVBS_v2", # TODO: remove this! "compute_MVBS_index_binning", "remove_noise", ] diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 20e7fc3c6..58ef91fd7 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -13,63 +13,6 @@ from .noise_est import NoiseEst -def _check_range_uniqueness(da): - """ - Check if range (``echo_range``) changes across ping in a given frequency channel. - """ - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. - da = da.squeeze() - - # remove pings with NaN entries if exist - # since goal here is to check uniqueness - if np.unique(da.isnull(), axis=0).shape[0] != 1: - da = da.dropna(dim="ping_time", how="any") - - # remove padded NaN entries if exist for all pings - da = da.dropna(dim="range_sample", how="all") - - ping_time_idx = np.argwhere([dim == "ping_time" for dim in da.dims])[0][0] - if np.unique(da, axis=ping_time_idx).shape[ping_time_idx] == 1: - return xr.DataArray(data=True, coords={"channel": da["channel"].values}) - else: - return xr.DataArray(data=False, coords={"channel": da["channel"].values}) - - -def _freq_MVBS(ds, rint, pbin): - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. 
- ds = ds.squeeze() - - # average should be done in linear domain - sv = 10 ** (ds["Sv"] / 10) - - # set 1D coordinate using the 1st ping echo_range since identical for all pings - # remove padded NaN entries if exist for all pings - er = ( - ds["echo_range"] - .dropna(dim="range_sample", how="all") - .dropna(dim="ping_time") - .isel(ping_time=0) - ) - - # use first ping er as indexer for sv - sv = sv.sel(range_sample=er.range_sample.values) - sv.coords["echo_range"] = (["range_sample"], er.values) - sv = sv.swap_dims({"range_sample": "echo_range"}) - sv_groupby_bins = ( - sv.resample(ping_time=pbin, skipna=True) - .mean(skipna=True) - .groupby_bins("echo_range", bins=rint, right=False, include_lowest=True) - .mean(skipna=True) - ) - sv_groupby_bins.coords["echo_range"] = (["echo_range_bins"], rint[:-1]) - sv_groupby_bins = sv_groupby_bins.swap_dims({"echo_range_bins": "echo_range"}).drop_vars( - "echo_range_bins" - ) - return 10 * np.log10(sv_groupby_bins) - - def _set_MVBS_attrs(ds): """ Attach common attributes. @@ -118,109 +61,6 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): A dataset containing bin-averaged Sv """ - if not ds_Sv["echo_range"].groupby("channel").apply(_check_range_uniqueness).all(): - raise ValueError( - "echo_range variable changes across pings in at least one of the frequency channels." 
- ) - - # Groupby freq in case of different echo_range (from different sampling intervals) - range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) - ds_MVBS = ( - ds_Sv.groupby("channel") - .apply(_freq_MVBS, args=(range_interval, ping_time_bin)) - .to_dataset() - ) - # Added this check to support the test in test_process.py::test_compute_MVBS - if "filenames" in ds_MVBS.variables: - ds_MVBS = ds_MVBS.drop_vars("filenames") - - # ping_time_bin parsing and conversions - # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings - # https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.resample.html - # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.html - # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html - # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects - # https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units - timedelta_units = { - "d": {"nptd64": "D", "unitstr": "day"}, - "h": {"nptd64": "h", "unitstr": "hour"}, - "t": {"nptd64": "m", "unitstr": "minute"}, - "min": {"nptd64": "m", "unitstr": "minute"}, - "s": {"nptd64": "s", "unitstr": "second"}, - "l": {"nptd64": "ms", "unitstr": "millisecond"}, - "ms": {"nptd64": "ms", "unitstr": "millisecond"}, - "u": {"nptd64": "us", "unitstr": "microsecond"}, - "us": {"nptd64": "ms", "unitstr": "millisecond"}, - "n": {"nptd64": "ns", "unitstr": "nanosecond"}, - "ns": {"nptd64": "ms", "unitstr": "millisecond"}, - } - ping_time_bin_td = pd.Timedelta(ping_time_bin) - # res = resolution (most granular time unit) - ping_time_bin_resunit = ping_time_bin_td.resolution_string.lower() - ping_time_bin_resvalue = int( - ping_time_bin_td / np.timedelta64(1, timedelta_units[ping_time_bin_resunit]["nptd64"]) - ) - ping_time_bin_resunit_label = 
timedelta_units[ping_time_bin_resunit]["unitstr"] - - # Attach attributes - _set_MVBS_attrs(ds_MVBS) - ds_MVBS["echo_range"].attrs = {"long_name": "Range distance", "units": "m"} - ds_MVBS["Sv"] = ds_MVBS["Sv"].assign_attrs( - { - "cell_methods": ( - f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} " # noqa - "comment: ping_time is the interval start) " - f"echo_range: mean (interval: {range_meter_bin} meter " - "comment: echo_range is the interval start)" - ), - "binning_mode": "physical units", - "range_meter_interval": str(range_meter_bin) + "m", - "ping_time_interval": ping_time_bin, - "actual_range": [ - round(float(ds_MVBS["Sv"].min().values), 2), - round(float(ds_MVBS["Sv"].max().values), 2), - ], - } - ) - - prov_dict = echopype_prov_attrs(process_type="processing") - prov_dict["processing_function"] = "preprocess.compute_MVBS" - ds_MVBS = ds_MVBS.assign_attrs(prov_dict) - ds_MVBS["frequency_nominal"] = ds_Sv["frequency_nominal"] # re-attach frequency_nominal - - return ds_MVBS - - -def compute_MVBS_v2(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): - """ - Compute Mean Volume Backscattering Strength (MVBS) - based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. - - Output of this function differs from that of ``compute_MVBS_index_binning``, which computes - bin-averaged Sv according to intervals of ``echo_range`` and ``ping_time`` specified as - index number. 
- - Parameters - ---------- - ds_Sv : xr.Dataset - dataset containing Sv and ``echo_range`` [m] - range_meter_bin : Union[int, float] - bin size along ``echo_range`` in meters, default to ``20`` - ping_time_bin : str - bin size along ``ping_time``, default to ``20S`` - - Returns - ------- - A dataset containing bin-averaged Sv - """ - - # TODO: remove this as it is no longer necessary - # if not ds_Sv["echo_range"].groupby("channel").apply(_check_range_uniqueness).all(): - # raise ValueError( - # "echo_range variable changes across pings in at least one of the frequency channels." - # ) - # create bin information for echo_range range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) @@ -509,6 +349,9 @@ def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): def get_unequal_rows(mat, row): + + # TODO: document! + # compare row against all rows in mat (allowing for NaNs to be equal) element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) @@ -526,6 +369,8 @@ def get_unequal_rows(mat, row): def is_grouping_needed_comprehensive(er_chan): + # TODO: document! + if isinstance(er_chan, xr.DataArray): er_chan = er_chan.values @@ -567,6 +412,8 @@ def is_grouping_needed_less_comprehensive(er_chan): is necessary. """ + # TODO: document! 
+ # determine the number of NaNs in each ping and find the unique number of NaNs unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) @@ -654,7 +501,7 @@ def bin_and_mean_2d( else: grouping_needed = is_grouping_needed_less_comprehensive(echo_range) - # TODO: put the below into its own function + # TODO: put the below into its own function if grouping_needed: # compute bin indices to allow for downstream processes (mainly axis argument in unique) From 646f45c18db3267daff778ab8fab0a3db7753ad4 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 10 Nov 2022 15:55:36 -0800 Subject: [PATCH 22/28] comment and add docstrings to previously undocumented functions required for compute_MVBS --- echopype/preprocess/api.py | 96 ++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 13 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 58ef91fd7..1cb19ad5a 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -330,12 +330,30 @@ def get_bin_indices( return digitized_echo_range, bin_time_ind -def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): +def bin_and_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int +) -> Union[np.ndarray, dask.array.Array]: + """ + Bins and means ``arr`` with respect to the ``echo_range`` bins. - # TODO: document! 
+ Parameters + ---------- + arr: np.ndarray or dask.array.Array + 2D array to bin and mean + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: np.ndarray or dask.array.Array + 2D array representing the bin and mean of ``arr`` along ``echo_range`` + """ binned_means = [] for bin_er in range(1, n_bin_er): + # bin and mean echo_range dimension er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) @@ -348,9 +366,29 @@ def bin_and_mean_echo_range(arr, digitized_echo_range, n_bin_er): return er_means -def get_unequal_rows(mat, row): +def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: + """ + Obtains those row indices of ``mat`` that are not equal + to ``row``. + + Parameters + ---------- + mat: np.ndarray + 2D array with the same column dimension as the number + of elements in ``row`` + row: np.ndarray + 1D array with the same number of element elements as + the column dimension of ``mat`` - # TODO: document! + Returns + ------- + row_ind_not_equal: np.ndarray + The row indices of ``mat`` that are not equal to ``row`` + + Notes + ----- + Elements with NaNs are considered equal if they are in the same position. + """ # compare row against all rows in mat (allowing for NaNs to be equal) element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) @@ -367,10 +405,30 @@ def get_unequal_rows(mat, row): return row_ind_not_equal -def is_grouping_needed_comprehensive(er_chan): +def is_grouping_needed_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: + """ + A comprehensive check that determines if all ``echo_range`` values + along ``ping_time`` have the same step size. If they do not have + the same step sizes, then grouping of the ``echo_range`` values + will be necessary. 
+ + Parameters + ---------- + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - # TODO: document! + Notes + ----- + ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ + # grab the in-memory numpy echo_range values, if necessary if isinstance(er_chan, xr.DataArray): er_chan = er_chan.values @@ -379,6 +437,7 @@ def is_grouping_needed_comprehensive(er_chan): while np.all(np.isnan(er_chan[ping_index, :])): ping_index += 1 + # determine those rows of er_chan that are not equal to the row ping_index unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) if len(unequal_ping_ind) > 0: @@ -397,26 +456,36 @@ def is_grouping_needed_comprehensive(er_chan): return False -def is_grouping_needed_less_comprehensive(er_chan): +def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: """ - + An alternative (less comprehensive) check that determines if all + ``echo_range`` values along ``ping_time`` have the same step size. + If they do not have the same step sizes, then grouping of the + ``echo_range`` values will be necessary. Parameters ---------- - er_chan + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False Notes ----- - Although this method is slightly faster than ``is_grouping_needed_comprehensive``, - it is possible that this method will incorrectly determine if grouping + It is possible that this method will incorrectly determine if grouping is necessary. - """ - # TODO: document! 
+ ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ # determine the number of NaNs in each ping and find the unique number of NaNs unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) + # compute the results, if necessary, to allow for downstream checks if isinstance(unique_num_nans, dask.array.Array): unique_num_nans = unique_num_nans.compute() @@ -433,6 +502,7 @@ def is_grouping_needed_less_comprehensive(er_chan): # make sure that the final echo_range value for each ping_time is the same (account for NaN) num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() + # compute the results, if necessary, to allow for downstream checks if isinstance(num_non_nans, dask.array.Array): num_non_nans = num_non_nans.compute() From ff095cdd2a6df2519203f5a2f788fb4ea8a0316e Mon Sep 17 00:00:00 2001 From: b-reyes Date: Thu, 10 Nov 2022 16:30:37 -0800 Subject: [PATCH 23/28] clean up bin_and_mean_2d code by creating a function for grouping, binning, and calculating the mean of an array with respect to echo_range --- echopype/preprocess/api.py | 89 +++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 1cb19ad5a..e65f53a28 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -514,6 +514,60 @@ def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarra return False +def group_bin_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], + digitized_echo_range: Union[np.ndarray, dask.array.Array], + n_bin_er: int, +) -> Union[np.ndarray, dask.array.Array]: + """ + Groups the rows of ``arr`` such that they have the same corresponding + row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` + on each group, and lastly assembles the correctly ordered ``er_means`` array + representing the bin and mean of ``arr`` 
with respect to ``echo_range``. + + Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + digitized_echo_range: dask.array.Array or np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: dask.array.Array or np.ndarray + The bin and mean of ``arr`` with respect to ``echo_range`` + """ + + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() + + # determine the unique rows of digitized_echo_range and the inverse + unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) + + # create groups of row indices using the unique inverse + grps_same_ind = [ + np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) + ] + + # for each group bin and mean arr along echo_range + # note: the values appended may not be in the correct final order + binned_er = [] + for count, grp in enumerate(grps_same_ind): + binned_er.append( + bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) + ) + + # construct er_means and put the columns in the correct order + binned_er_array = np.hstack(binned_er) + correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) + er_means = binned_er_array[:, correct_column_ind] + + return er_means + + def bin_and_mean_2d( arr: Union[dask.array.Array, np.ndarray], bins_time: np.ndarray, @@ -564,59 +618,34 @@ def bin_and_mean_2d( # obtain the bin indices for echo_range and times digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - # TODO: clean up the below code! 
- + # determine if grouping of echo_range values with the same step size is necessary if comp_er_check: grouping_needed = is_grouping_needed_comprehensive(echo_range) else: grouping_needed = is_grouping_needed_less_comprehensive(echo_range) - # TODO: put the below into its own function if grouping_needed: - - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - unique_er_bin_ind, unique_inverse = np.unique( - digitized_echo_range, axis=0, return_inverse=True - ) - grps_same_ind = [ - np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) - ] - - binned_er = [] # the values appended may not be in the correct order - for count, grp in enumerate(grps_same_ind): - binned_er.append( - bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) - ) - - # put the columns in the correct order - binned_er_array = np.hstack(binned_er) - correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) - er_means = binned_er_array[:, correct_column_ind] - + # groups, bins, and means arr with respect to echo_range + er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) else: + # bin and mean arr with respect to echo_range er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) # if er_means is a dask array we compute it so the graph does not get too large if isinstance(er_means, dask.array.Array): er_means = er_means.compute() - # create final reduced array + # create final reduced array i.e. 
MVBS final = np.empty((n_bin_time, n_bin_er - 1)) - for bin_time in range(1, n_bin_time + 1): # obtain er_mean indices corresponding to the time bin indices = np.argwhere(bin_time_ind == bin_time).flatten() if len(indices) == 0: - # fill values with NaN, if there are no values in the bin final[bin_time - 1, :] = np.nan else: - # bin and mean the er_mean time bin final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) From 82687481fbb812777eb4420b14ba31e46f5a729b Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 10:50:41 -0800 Subject: [PATCH 24/28] catch warnings associated with taking the nanmean of an array filled with NaNs --- echopype/preprocess/api.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index e65f53a28..29024ca8f 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,9 +2,11 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" +import warnings from typing import Tuple, Union import dask.array +import dask.distributed import numpy as np import pandas as pd import xarray as xr @@ -354,8 +356,14 @@ def bin_and_mean_echo_range( binned_means = [] for bin_er in range(1, n_bin_er): - # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + # Catch a known warning that can occur, which does not impact the results + with warnings.catch_warnings(): + + # ignore warnings caused by taking a mean of an array filled with NaNs + warnings.filterwarnings(action="ignore", message="Mean of empty slice") + + # bin and mean echo_range dimension + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) # collect all echo_range bins binned_means.append(er_selected_data) From 7cadba27e08e66972f49ad32408bb8c2b969f367 Mon Sep 17 00:00:00 2001 From: b-reyes <53541061+b-reyes@users.noreply.github.com> Date: Wed, 23 Nov 2022 10:57:49 -0800 Subject: [PATCH 25/28] provide more specific docstring for arr in bin_and_mean_echo_range Co-authored-by: Wu-Jung Lee --- echopype/preprocess/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index 29024ca8f..bc258bc80 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -341,7 +341,8 @@ def bin_and_mean_echo_range( Parameters ---------- arr: np.ndarray or dask.array.Array - 2D array to bin and mean + 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` + and compute mean of each bin digitized_echo_range: np.ndarray 2D array of bin indices for ``echo_range`` n_bin_er: int From 32093d11f9b0c50019eb380d77dabd9f2ce8a1f2 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 11:37:40 -0800 Subject: [PATCH 26/28] refactor and rename code associated with determining if grouping of echo_range is needed --- echopype/preprocess/api.py | 45 +++++++++++++++----- 
echopype/tests/preprocess/test_preprocess.py | 2 +- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index bc258bc80..ba1f80b5b 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -414,7 +414,7 @@ def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: return row_ind_not_equal -def is_grouping_needed_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: +def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: """ A comprehensive check that determines if all ``echo_range`` values along ``ping_time`` have the same step size. If they do not have @@ -465,7 +465,7 @@ def is_grouping_needed_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) - return False -def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: +def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: """ An alternative (less comprehensive) check that determines if all ``echo_range`` values along ``ping_time`` have the same step size. @@ -523,6 +523,34 @@ def is_grouping_needed_less_comprehensive(er_chan: Union[xr.DataArray, np.ndarra return False +def is_er_grouping_needed( + echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool +) -> bool: + """ + Determines if ``echo_range`` values along ``ping_time`` can change and + thus need to be grouped. 
+ + Parameters + ---------- + echo_range: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + comprehensive_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done + + Returns + ------- + bool + If True grouping of ``echo_range`` will be required, else it will not + be necessary + """ + + if comprehensive_er_check: + return if_all_er_steps_identical(echo_range) + else: + return if_last_er_steps_identical(echo_range) + + def group_bin_mean_echo_range( arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: Union[np.ndarray, dask.array.Array], @@ -583,7 +611,7 @@ def bin_and_mean_2d( bins_er: np.ndarray, times: np.ndarray, echo_range: np.ndarray, - comp_er_check: bool = True, + comprehensive_er_check: bool = True, ) -> np.ndarray: """ Bins and means ``arr`` based on ``times`` and ``echo_range``, @@ -602,7 +630,7 @@ def bin_and_mean_2d( 1D array corresponding to the time values that should be binned echo_range: np.ndarray 2D array of echo range values - comp_er_check: bool + comprehensive_er_check: bool If True, a more comprehensive check will be completed to determine if ``echo_range`` grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done @@ -628,12 +656,9 @@ def bin_and_mean_2d( digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) # determine if grouping of echo_range values with the same step size is necessary - if comp_er_check: - grouping_needed = is_grouping_needed_comprehensive(echo_range) - else: - grouping_needed = is_grouping_needed_less_comprehensive(echo_range) + er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) - if grouping_needed: + if er_grouping_needed: # groups, bins, and means arr with respect to echo_range er_means = group_bin_mean_echo_range(arr, 
digitized_echo_range, n_bin_er) else: @@ -709,7 +734,7 @@ def get_MVBS_along_channels( bins_er=echo_range_interval, times=sv.ping_time.data, echo_range=ds["echo_range"], - comp_er_check=True, + comprehensive_er_check=True, ) # apply inverse mapping to get back to the original domain and store values diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index 40eb058ca..e9ae00f35 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -924,7 +924,7 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: # calculate MVBS for mock data set calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, bins_er=digitize_er_bin, times=final_ping_time, - echo_range=final_er, comp_er_check=True) + echo_range=final_er, comprehensive_er_check=True) # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) From 94d640de4cd81d0d9959d8ef8017db7bddbae718 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 11:55:37 -0800 Subject: [PATCH 27/28] move all core routines needed for compute_MVBS into the python script mvbs.py --- echopype/preprocess/api.py | 462 +----------------- echopype/preprocess/mvbs.py | 466 +++++++++++++++++++ echopype/tests/preprocess/test_preprocess.py | 2 +- 3 files changed, 468 insertions(+), 462 deletions(-) create mode 100644 echopype/preprocess/mvbs.py diff --git a/echopype/preprocess/api.py b/echopype/preprocess/api.py index ba1f80b5b..27a0453fc 100644 --- a/echopype/preprocess/api.py +++ b/echopype/preprocess/api.py @@ -2,16 +2,12 @@ Functions for enhancing the spatial and temporal coherence of data. 
""" -import warnings -from typing import Tuple, Union - -import dask.array -import dask.distributed import numpy as np import pandas as pd import xarray as xr from ..utils.prov import echopype_prov_attrs +from .mvbs import get_MVBS_along_channels from .noise_est import NoiseEst @@ -286,459 +282,3 @@ def remove_noise(ds_Sv, ping_num, range_sample_num, noise_max=None, SNR_threshol def regrid(): return 1 - - -def get_bin_indices( - echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray -) -> Tuple[np.ndarray, np.ndarray]: - """ - Obtains the bin index of ``echo_range`` and ``times`` based - on the binning ``bins_er`` and ``bins_time``, respectively. - - Parameters - ---------- - echo_range: np.ndarray - 2D array of echo range values - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - - Returns - ------- - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - bin_time_ind: np.ndarray - 1D array of bin indices for ``times`` - """ - - # get bin index for each echo range value - digitized_echo_range = np.digitize(echo_range, bins_er, right=False) - - # turn datetime into integers, so we can use np.digitize - if isinstance(times, dask.array.Array): - times_i8 = times.compute().data.view("i8") - else: - times_i8 = times.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_time_i8 = bins_time.view("i8") - - # get bin index for each time - bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) - - return digitized_echo_range, bin_time_ind - - -def bin_and_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int -) -> Union[np.ndarray, dask.array.Array]: - """ - Bins and means ``arr`` with 
respect to the ``echo_range`` bins. - - Parameters - ---------- - arr: np.ndarray or dask.array.Array - 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` - and compute mean of each bin - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: np.ndarray or dask.array.Array - 2D array representing the bin and mean of ``arr`` along ``echo_range`` - """ - - binned_means = [] - for bin_er in range(1, n_bin_er): - - # Catch a known warning that can occur, which does not impact the results - with warnings.catch_warnings(): - - # ignore warnings caused by taking a mean of an array filled with NaNs - warnings.filterwarnings(action="ignore", message="Mean of empty slice") - - # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - - # collect all echo_range bins - binned_means.append(er_selected_data) - - # create full echo_range binned array - er_means = np.vstack(binned_means) - - return er_means - - -def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: - """ - Obtains those row indices of ``mat`` that are not equal - to ``row``. - - Parameters - ---------- - mat: np.ndarray - 2D array with the same column dimension as the number - of elements in ``row`` - row: np.ndarray - 1D array with the same number of element elements as - the column dimension of ``mat`` - - Returns - ------- - row_ind_not_equal: np.ndarray - The row indices of ``mat`` that are not equal to ``row`` - - Notes - ----- - Elements with NaNs are considered equal if they are in the same position. 
- """ - - # compare row against all rows in mat (allowing for NaNs to be equal) - element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) - - # determine if mat row is equal to row - row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) - - if isinstance(row_not_equal, dask.array.Array): - row_not_equal = row_not_equal.compute() - - # get those row indices that are not equal to row - row_ind_not_equal = np.argwhere(row_not_equal).flatten() - - return row_ind_not_equal - - -def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - A comprehensive check that determines if all ``echo_range`` values - along ``ping_time`` have the same step size. If they do not have - the same step sizes, then grouping of the ``echo_range`` values - will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # grab the in-memory numpy echo_range values, if necessary - if isinstance(er_chan, xr.DataArray): - er_chan = er_chan.values - - # grab the first ping_time that is not filled with NaNs - ping_index = 0 - while np.all(np.isnan(er_chan[ping_index, :])): - ping_index += 1 - - # determine those rows of er_chan that are not equal to the row ping_index - unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) - - if len(unequal_ping_ind) > 0: - - # see if all unequal_ping_ind are filled with NaNs - all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) - - if all_nans: - # All echo_range values have the same step size - return False - else: - # Some echo_range values have different step sizes - return True - else: - # All echo_range values have the same step 
size - return False - - -def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - An alternative (less comprehensive) check that determines if all - ``echo_range`` values along ``ping_time`` have the same step size. - If they do not have the same step sizes, then grouping of the - ``echo_range`` values will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - It is possible that this method will incorrectly determine if grouping - is necessary. - - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # determine the number of NaNs in each ping and find the unique number of NaNs - unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) - - # compute the results, if necessary, to allow for downstream checks - if isinstance(unique_num_nans, dask.array.Array): - unique_num_nans = unique_num_nans.compute() - - # determine if any value is not 0 or er_chan.shape[1] - unexpected_num_nans = False in np.logical_or( - unique_num_nans == 0, unique_num_nans == er_chan.shape[1] - ) - - if unexpected_num_nans: - # echo_range varies with ping_time - return True - else: - - # make sure that the final echo_range value for each ping_time is the same (account for NaN) - num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() - - # compute the results, if necessary, to allow for downstream checks - if isinstance(num_non_nans, dask.array.Array): - num_non_nans = num_non_nans.compute() - - if num_non_nans > 1: - # echo_range varies with ping_time - return True - else: - # echo_range does not vary with ping_time - return False - - -def is_er_grouping_needed( - echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool 
-) -> bool: - """ - Determines if ``echo_range`` values along ``ping_time`` can change and - thus need to be grouped. - - Parameters - ---------- - echo_range: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - bool - If True grouping of ``echo_range`` will be required, else it will not - be necessary - """ - - if comprehensive_er_check: - return if_all_er_steps_identical(echo_range) - else: - return if_last_er_steps_identical(echo_range) - - -def group_bin_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], - digitized_echo_range: Union[np.ndarray, dask.array.Array], - n_bin_er: int, -) -> Union[np.ndarray, dask.array.Array]: - """ - Groups the rows of ``arr`` such that they have the same corresponding - row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` - on each group, and lastly assembles the correctly ordered ``er_means`` array - representing the bin and mean of ``arr`` with respect to ``echo_range``. 
- - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - digitized_echo_range: dask.array.Array or np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: dask.array.Array or np.ndarray - The bin and mean of ``arr`` with respect to ``echo_range`` - """ - - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - # determine the unique rows of digitized_echo_range and the inverse - unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) - - # create groups of row indices using the unique inverse - grps_same_ind = [ - np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) - ] - - # for each group bin and mean arr along echo_range - # note: the values appended may not be in the correct final order - binned_er = [] - for count, grp in enumerate(grps_same_ind): - binned_er.append( - bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) - ) - - # construct er_means and put the columns in the correct order - binned_er_array = np.hstack(binned_er) - correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) - er_means = binned_er_array[:, correct_column_ind] - - return er_means - - -def bin_and_mean_2d( - arr: Union[dask.array.Array, np.ndarray], - bins_time: np.ndarray, - bins_er: np.ndarray, - times: np.ndarray, - echo_range: np.ndarray, - comprehensive_er_check: bool = True, -) -> np.ndarray: - """ - Bins and means ``arr`` based on ``times`` and ``echo_range``, - and their corresponding bins. If ``arr`` is ``Sv`` then this - will compute the MVBS. 
- - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - echo_range: np.ndarray - 2D array of echo range values - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - final_reduced: np.ndarray - The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS - - Notes - ----- - This function assumes that ``arr`` has rows corresponding to - ``ping_time`` and columns corresponding to ``echo_range``. - - This function should not be run if the number of ``echo_range`` values - vary amongst ``ping_times``. - """ - - # get the number of echo range and time bins - n_bin_er = len(bins_er) - n_bin_time = len(bins_time) - - # obtain the bin indices for echo_range and times - digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - - # determine if grouping of echo_range values with the same step size is necessary - er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) - - if er_grouping_needed: - # groups, bins, and means arr with respect to echo_range - er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) - else: - # bin and mean arr with respect to echo_range - er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) - - # if er_means is a dask array we compute it so the graph does not get too large - if isinstance(er_means, dask.array.Array): - er_means = er_means.compute() - - # create final reduced array i.e. 
MVBS - final = np.empty((n_bin_time, n_bin_er - 1)) - for bin_time in range(1, n_bin_time + 1): - - # obtain er_mean indices corresponding to the time bin - indices = np.argwhere(bin_time_ind == bin_time).flatten() - - if len(indices) == 0: - # fill values with NaN, if there are no values in the bin - final[bin_time - 1, :] = np.nan - else: - # bin and mean the er_mean time bin - final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) - - return final - - -def get_MVBS_along_channels( - ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray -) -> np.ndarray: - """ - Computes the MVBS of ``ds_Sv`` along each channel for the given - intervals. - - Parameters - ---------- - ds_Sv: xr.Dataset - A Dataset containing ``Sv`` and ``echo_range`` data with coordinates - ``channel``, ``ping_time``, and ``range_sample`` - echo_range_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - ping_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``ping_time`` - - Returns - ------- - np.ndarray - The MVBS value of the input ``ds_Sv`` for all channels - - Notes - ----- - If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with - respect to ``echo_range`` will take place, then the delayed result will be computed, - and lastly the binning and mean with respect to ``ping_time`` will be completed. It - is necessary to apply a compute midway through this method because Dask graph layers - get too large and this makes downstream operations very inefficient. - """ - - all_MVBS = [] - for chan in ds_Sv.channel: - - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. 
- ds = ds_Sv.sel(channel=chan).squeeze() - - # average should be done in linear domain - sv = 10 ** (ds["Sv"] / 10) - - # get MVBS for channel in linear domain - chan_MVBS = bin_and_mean_2d( - sv.data, - bins_time=ping_interval, - bins_er=echo_range_interval, - times=sv.ping_time.data, - echo_range=ds["echo_range"], - comprehensive_er_check=True, - ) - - # apply inverse mapping to get back to the original domain and store values - all_MVBS.append(10 * np.log10(chan_MVBS)) - - # collect the MVBS values for each channel - return np.stack(all_MVBS, axis=0) diff --git a/echopype/preprocess/mvbs.py b/echopype/preprocess/mvbs.py new file mode 100644 index 000000000..a6d548fe5 --- /dev/null +++ b/echopype/preprocess/mvbs.py @@ -0,0 +1,466 @@ +""" +Contains core functions needed to compute the MVBS of an input dataset. +""" + +import warnings +from typing import Tuple, Union + +import dask.array +import numpy as np +import xarray as xr + + +def get_bin_indices( + echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: + """ + Obtains the bin index of ``echo_range`` and ``times`` based + on the binning ``bins_er`` and ``bins_time``, respectively. 
+ + Parameters + ---------- + echo_range: np.ndarray + 2D array of echo range values + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + + Returns + ------- + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + bin_time_ind: np.ndarray + 1D array of bin indices for ``times`` + """ + + # get bin index for each echo range value + digitized_echo_range = np.digitize(echo_range, bins_er, right=False) + + # turn datetime into integers, so we can use np.digitize + if isinstance(times, dask.array.Array): + times_i8 = times.compute().data.view("i8") + else: + times_i8 = times.view("i8") + + # turn datetime into integers, so we can use np.digitize + bins_time_i8 = bins_time.view("i8") + + # get bin index for each time + bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) + + return digitized_echo_range, bin_time_ind + + +def bin_and_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int +) -> Union[np.ndarray, dask.array.Array]: + """ + Bins and means ``arr`` with respect to the ``echo_range`` bins. 
+ + Parameters + ---------- + arr: np.ndarray or dask.array.Array + 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` + and compute mean of each bin + digitized_echo_range: np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: np.ndarray or dask.array.Array + 2D array representing the bin and mean of ``arr`` along ``echo_range`` + """ + + binned_means = [] + for bin_er in range(1, n_bin_er): + + # Catch a known warning that can occur, which does not impact the results + with warnings.catch_warnings(): + + # ignore warnings caused by taking a mean of an array filled with NaNs + warnings.filterwarnings(action="ignore", message="Mean of empty slice") + + # bin and mean echo_range dimension + er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) + + # collect all echo_range bins + binned_means.append(er_selected_data) + + # create full echo_range binned array + er_means = np.vstack(binned_means) + + return er_means + + +def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: + """ + Obtains those row indices of ``mat`` that are not equal + to ``row``. + + Parameters + ---------- + mat: np.ndarray + 2D array with the same column dimension as the number + of elements in ``row`` + row: np.ndarray + 1D array with the same number of element elements as + the column dimension of ``mat`` + + Returns + ------- + row_ind_not_equal: np.ndarray + The row indices of ``mat`` that are not equal to ``row`` + + Notes + ----- + Elements with NaNs are considered equal if they are in the same position. 
+ """ + + # compare row against all rows in mat (allowing for NaNs to be equal) + element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) + + # determine if mat row is equal to row + row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) + + if isinstance(row_not_equal, dask.array.Array): + row_not_equal = row_not_equal.compute() + + # get those row indices that are not equal to row + row_ind_not_equal = np.argwhere(row_not_equal).flatten() + + return row_ind_not_equal + + +def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: + """ + A comprehensive check that determines if all ``echo_range`` values + along ``ping_time`` have the same step size. If they do not have + the same step sizes, then grouping of the ``echo_range`` values + will be necessary. + + Parameters + ---------- + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False + + Notes + ----- + ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ + + # grab the in-memory numpy echo_range values, if necessary + if isinstance(er_chan, xr.DataArray): + er_chan = er_chan.values + + # grab the first ping_time that is not filled with NaNs + ping_index = 0 + while np.all(np.isnan(er_chan[ping_index, :])): + ping_index += 1 + + # determine those rows of er_chan that are not equal to the row ping_index + unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) + + if len(unequal_ping_ind) > 0: + + # see if all unequal_ping_ind are filled with NaNs + all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) + + if all_nans: + # All echo_range values have the same step size + return False + else: + # Some echo_range values have different step sizes + return True + else: + # All echo_range values have the same step 
size + return False + + +def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: + """ + An alternative (less comprehensive) check that determines if all + ``echo_range`` values along ``ping_time`` have the same step size. + If they do not have the same step sizes, then grouping of the + ``echo_range`` values will be necessary. + + Parameters + ---------- + er_chan: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + + Returns + ------- + bool + True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False + + Notes + ----- + It is possible that this method will incorrectly determine if grouping + is necessary. + + ``er_chan`` should have rows corresponding to ``ping_time`` and columns + corresponding to ``range_sample`` + """ + + # determine the number of NaNs in each ping and find the unique number of NaNs + unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) + + # compute the results, if necessary, to allow for downstream checks + if isinstance(unique_num_nans, dask.array.Array): + unique_num_nans = unique_num_nans.compute() + + # determine if any value is not 0 or er_chan.shape[1] + unexpected_num_nans = False in np.logical_or( + unique_num_nans == 0, unique_num_nans == er_chan.shape[1] + ) + + if unexpected_num_nans: + # echo_range varies with ping_time + return True + else: + + # make sure that the final echo_range value for each ping_time is the same (account for NaN) + num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() + + # compute the results, if necessary, to allow for downstream checks + if isinstance(num_non_nans, dask.array.Array): + num_non_nans = num_non_nans.compute() + + if num_non_nans > 1: + # echo_range varies with ping_time + return True + else: + # echo_range does not vary with ping_time + return False + + +def is_er_grouping_needed( + echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool 
+) -> bool: + """ + Determines if ``echo_range`` values along ``ping_time`` can change and + thus need to be grouped. + + Parameters + ---------- + echo_range: xr.DataArray or np.ndarray + 2D array containing the ``echo_range`` values for each ``ping_time`` + comprehensive_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done + + Returns + ------- + bool + If True grouping of ``echo_range`` will be required, else it will not + be necessary + """ + + if comprehensive_er_check: + return if_all_er_steps_identical(echo_range) + else: + return if_last_er_steps_identical(echo_range) + + +def group_bin_mean_echo_range( + arr: Union[np.ndarray, dask.array.Array], + digitized_echo_range: Union[np.ndarray, dask.array.Array], + n_bin_er: int, +) -> Union[np.ndarray, dask.array.Array]: + """ + Groups the rows of ``arr`` such that they have the same corresponding + row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` + on each group, and lastly assembles the correctly ordered ``er_means`` array + representing the bin and mean of ``arr`` with respect to ``echo_range``. 
+ + Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + digitized_echo_range: dask.array.Array or np.ndarray + 2D array of bin indices for ``echo_range`` + n_bin_er: int + The number of echo range bins + + Returns + ------- + er_means: dask.array.Array or np.ndarray + The bin and mean of ``arr`` with respect to ``echo_range`` + """ + + # compute bin indices to allow for downstream processes (mainly axis argument in unique) + if isinstance(digitized_echo_range, dask.array.Array): + digitized_echo_range = digitized_echo_range.compute() + + # determine the unique rows of digitized_echo_range and the inverse + unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) + + # create groups of row indices using the unique inverse + grps_same_ind = [ + np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) + ] + + # for each group bin and mean arr along echo_range + # note: the values appended may not be in the correct final order + binned_er = [] + for count, grp in enumerate(grps_same_ind): + binned_er.append( + bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) + ) + + # construct er_means and put the columns in the correct order + binned_er_array = np.hstack(binned_er) + correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) + er_means = binned_er_array[:, correct_column_ind] + + return er_means + + +def bin_and_mean_2d( + arr: Union[dask.array.Array, np.ndarray], + bins_time: np.ndarray, + bins_er: np.ndarray, + times: np.ndarray, + echo_range: np.ndarray, + comprehensive_er_check: bool = True, +) -> np.ndarray: + """ + Bins and means ``arr`` based on ``times`` and ``echo_range``, + and their corresponding bins. If ``arr`` is ``Sv`` then this + will compute the MVBS. 
+ + Parameters + ---------- + arr: dask.array.Array or np.ndarray + The 2D array whose values should be binned + bins_time: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``times`` + bins_er: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + times: np.ndarray + 1D array corresponding to the time values that should be binned + echo_range: np.ndarray + 2D array of echo range values + comprehensive_er_check: bool + If True, a more comprehensive check will be completed to determine if ``echo_range`` + grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done + + Returns + ------- + final_reduced: np.ndarray + The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS + + Notes + ----- + This function assumes that ``arr`` has rows corresponding to + ``ping_time`` and columns corresponding to ``echo_range``. + + This function should not be run if the number of ``echo_range`` values + vary amongst ``ping_times``. + """ + + # get the number of echo range and time bins + n_bin_er = len(bins_er) + n_bin_time = len(bins_time) + + # obtain the bin indices for echo_range and times + digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) + + # determine if grouping of echo_range values with the same step size is necessary + er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) + + if er_grouping_needed: + # groups, bins, and means arr with respect to echo_range + er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) + else: + # bin and mean arr with respect to echo_range + er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) + + # if er_means is a dask array we compute it so the graph does not get too large + if isinstance(er_means, dask.array.Array): + er_means = er_means.compute() + + # create final reduced array i.e. 
MVBS + final = np.empty((n_bin_time, n_bin_er - 1)) + for bin_time in range(1, n_bin_time + 1): + + # obtain er_mean indices corresponding to the time bin + indices = np.argwhere(bin_time_ind == bin_time).flatten() + + if len(indices) == 0: + # fill values with NaN, if there are no values in the bin + final[bin_time - 1, :] = np.nan + else: + # bin and mean the er_mean time bin + final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) + + return final + + +def get_MVBS_along_channels( + ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray +) -> np.ndarray: + """ + Computes the MVBS of ``ds_Sv`` along each channel for the given + intervals. + + Parameters + ---------- + ds_Sv: xr.Dataset + A Dataset containing ``Sv`` and ``echo_range`` data with coordinates + ``channel``, ``ping_time``, and ``range_sample`` + echo_range_interval: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``echo_range`` + ping_interval: np.ndarray + 1D array (used by np.digitize) representing the binning required for ``ping_time`` + + Returns + ------- + np.ndarray + The MVBS value of the input ``ds_Sv`` for all channels + + Notes + ----- + If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with + respect to ``echo_range`` will take place, then the delayed result will be computed, + and lastly the binning and mean with respect to ``ping_time`` will be completed. It + is necessary to apply a compute midway through this method because Dask graph layers + get too large and this makes downstream operations very inefficient. + """ + + all_MVBS = [] + for chan in ds_Sv.channel: + + # squeeze to remove "channel" dim if present + # TODO: not sure why not already removed for the AZFP case. Investigate. 
+ ds = ds_Sv.sel(channel=chan).squeeze() + + # average should be done in linear domain + sv = 10 ** (ds["Sv"] / 10) + + # get MVBS for channel in linear domain + chan_MVBS = bin_and_mean_2d( + sv.data, + bins_time=ping_interval, + bins_er=echo_range_interval, + times=sv.ping_time.data, + echo_range=ds["echo_range"], + comprehensive_er_check=True, + ) + + # apply inverse mapping to get back to the original domain and store values + all_MVBS.append(10 * np.log10(chan_MVBS)) + + # collect the MVBS values for each channel + return np.stack(all_MVBS, axis=0) diff --git a/echopype/tests/preprocess/test_preprocess.py b/echopype/tests/preprocess/test_preprocess.py index e9ae00f35..4e96986b3 100644 --- a/echopype/tests/preprocess/test_preprocess.py +++ b/echopype/tests/preprocess/test_preprocess.py @@ -6,7 +6,7 @@ import dask.array from numpy.random import default_rng -from echopype.preprocess.api import bin_and_mean_2d +from echopype.preprocess.mvbs import bin_and_mean_2d from typing import Tuple, Iterable, Union From 7b8fb36ff225092a491e07b361131663172fbf90 Mon Sep 17 00:00:00 2001 From: b-reyes Date: Wed, 23 Nov 2022 13:24:28 -0800 Subject: [PATCH 28/28] add note to docstring of bin_and_mean_2d and rename group_bin_mean_echo_range to group_dig_er_bin_mean_echo_range --- echopype/preprocess/mvbs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/echopype/preprocess/mvbs.py b/echopype/preprocess/mvbs.py index a6d548fe5..a08ebf8a9 100644 --- a/echopype/preprocess/mvbs.py +++ b/echopype/preprocess/mvbs.py @@ -273,7 +273,7 @@ def is_er_grouping_needed( return if_last_er_steps_identical(echo_range) -def group_bin_mean_echo_range( +def group_dig_er_bin_mean_echo_range( arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: Union[np.ndarray, dask.array.Array], n_bin_er: int, @@ -367,7 +367,8 @@ def bin_and_mean_2d( ``ping_time`` and columns corresponding to ``echo_range``. 
This function should not be run if the number of ``echo_range`` values - vary amongst ``ping_times``. + vary amongst ``ping_times``. This should not occur for our current use + of echopype-generated Sv data. """ # get the number of echo range and time bins @@ -382,7 +383,7 @@ def bin_and_mean_2d( if er_grouping_needed: # groups, bins, and means arr with respect to echo_range - er_means = group_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) + er_means = group_dig_er_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) else: # bin and mean arr with respect to echo_range er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er)