mitre
diff --git a/‎growthviz-data/.DS_Store
-6 KB b/‎growthviz-data/.DS_Store
-6 KB
diff --git a/‎growthviz-data/ext/growthfile_cdc_ext.csv.gz
973 KB b/‎growthviz-data/ext/growthfile_cdc_ext.csv.gz
973 KB
diff --git a/‎growthviz-data/ext/growthfile_who.csv.gz
398 KB b/‎growthviz-data/ext/growthfile_who.csv.gz
398 KB
diff --git a/‎growthviz-data/ext/swaps.csv
-21,195 b/‎growthviz-data/ext/swaps.csv
-21,195
diff --git a/‎growthviz/charts.py
+33-31 b/‎growthviz/charts.py
+33-31
diff --git a/‎growthviz/compare.py
+1-1 b/‎growthviz/compare.py
+1-1
diff --git a/‎growthviz/processdata.py
+44-32 b/‎growthviz/processdata.py
+44-32
diff --git a/‎growthviz/sumstats.py
+2-2 b/‎growthviz/sumstats.py
+2-2
diff --git a/‎requirements.txt
+1 b/‎requirements.txt
+1
@@ -45,19 +45,20 @@ def weight_distr(df, mode):
     Create charts with overall and outlier weight distributions (included values only)
 
     Parameters:
-    df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
-        category, colors, patterns, and sort_order columns
-    mode: (str) indicates how many of the weights you want to use. If set to 'high', the function
-        will only use weights above a certain threshold. Otherwise, it displays all the weights.
+    df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, and
+        include columns
+    mode: (str) indicates how many of the weights you want to use. If set to 'high',
+        the function will only use weights above a certain threshold. Otherwise, it
+        displays all the weights.
     """
-    wgt_grp = df[(df["param"] == "WEIGHTKG") & (df["include"] == True)]
+    wgt_grp = df[(df["param"] == "WEIGHTKG") & df["include"].eq(True)]
     if mode == "high":
         wgt_grp = wgt_grp.loc[wgt_grp["measurement"] >= 135]
         plt.title("Weights At or Above 135kg")
     else:
         plt.title("All Weights")
     if len(wgt_grp.index) == 0:
-        print("No included observations with weight (kg) >= 135")
+        print("No included observations with weight (kg) >= 135.")
         plt.close()
     else:
         round_col = wgt_grp.apply(
@@ -66,7 +67,7 @@ def weight_distr(df, mode):
         wgt_grp = wgt_grp.assign(round_weight=round_col.values)
         wgt_grp_sum = wgt_grp.groupby("round_weight")["subjid"].count().reset_index()
         plt.rcParams["figure.figsize"] = [7, 5]
-        wgt_grp_sum_plot = plt.bar(wgt_grp_sum["round_weight"], wgt_grp_sum["subjid"])
+        plt.bar(wgt_grp_sum["round_weight"], wgt_grp_sum["subjid"])
         # Ensure there is some breadth to the x-axis in case of just a few observations
         if wgt_grp["measurement"].max() - wgt_grp["measurement"].min() < 10:
             plt.xlim(wgt_grp["measurement"].min() - 5, wgt_grp["measurement"].max() + 5)
@@ -78,11 +79,12 @@ def weight_distr(df, mode):
 
 def make_age_charts(df, mode):
     """
-    Creates a chart with the age ranges in the dataset. Counts the number of subjids in each range.
+    Creates a chart with the age ranges in the dataset. Counts the number of subjids in
+    each range.
 
     Parameters:
-    df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
-        category, colors, patterns, and sort_order columns
+    df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat,
+        include, category, colors, patterns, and sort_order columns
     mode: (str) indicates whether you want the adults or pediatrics values.
     """
     obs_grp = df
@@ -94,8 +96,8 @@ def make_age_charts(df, mode):
     else:
         raise Exception("Valid modes are 'adults' and 'pediatrics'")
 
-    # Adds label, color, pattern and sort order columns to the dataframe based on the age of each
-    # row in the dataframe
+    # Adds label, color, pattern and sort order columns to the dataframe based on the
+    # age of each row in the dataframe
     def add_categories_to_frame(df_data, df_reference):
         categories = []
         colors = []
@@ -119,8 +121,8 @@ def add_categories_to_frame(df_data, df_reference):
     # Call the categorizing function on the data
     obs_grp = add_categories_to_frame(obs_grp, label_frame)
 
-    # Groups the new dataframe by category, sort order, colors and patterns. It then counts the
-    # number of subject ids in each group and sorts the values by sort order.
+    # Groups the new dataframe by category, sort order, colors and patterns. It then
+    # counts the number of subject ids in each group and sorts the values by sort order.
     obs_grp = (
         obs_grp.groupby(["category", "sort_order", "colors", "patterns"])["subjid"]
         .count()
@@ -214,14 +216,14 @@ def overlap_view_adults(
     xmin = math.floor(individual.age.min())
     xmax = math.ceil(individual.age.max())
     selected_param_plot.set_xlim(xmin, xmax)
-    if include_carry_forward == True:
+    if include_carry_forward is True:
         carry_forward = selected_param[
             selected_param.clean_value == "Exclude-Carried-Forward"
         ]
         selected_param_plot.scatter(
             x=carry_forward.age, y=carry_forward.measurement, c="c", marker="^"
         )
-    if include_percentiles == True:
+    if include_percentiles is True:
         if param == "WEIGHTKG":
             percentile_df = wt_df
         elif param == "BMI":
@@ -287,7 +289,7 @@ def overlap_view_adults_show(
     """
     Wraps overlap_view_adults with plt.show().
     """
-    plot = overlap_view_adults(
+    overlap_view_adults(
         obs_df,
         subjid,
         param,
@@ -345,14 +347,14 @@ def overlap_view_pediatrics(
         c="r",
         marker="x",
     )
-    if include_carry_forward == True:
+    if include_carry_forward is True:
         carry_forward = selected_param[
             selected_param.clean_value == "Exclude-Carried-Forward"
         ]
         selected_param_plot.scatter(
             x=carry_forward.age, y=carry_forward.measurement, c="c", marker="^"
         )
-    if include_percentiles == True:
+    if include_percentiles is True:
         percentile_df = wt_df if param == "WEIGHTKG" else ht_df
         percentile_window = percentile_df.loc[
             (percentile_df.Sex == individual.sex.min())
@@ -374,7 +376,7 @@ def overlap_view_pediatrics_show(
     """
     Wraps overlap_view_pediatrics with plt.show().
     """
-    plot = overlap_view_pediatrics(
+    overlap_view_pediatrics(
         obs_df, subjid, param, include_carry_forward, include_percentiles, wt_df, ht_df
     )
     plt.show()
@@ -442,7 +444,7 @@ def overlap_view_double_pediatrics(
     ax2.set_ylabel(
         "weight (kg)", color=color_secondary
     )  # we already handled the x-label with ax1
-    if include_percentiles == True:
+    if include_percentiles is True:
         percentile_window = wt_df.loc[wt_df.Sex == individual.sex.min()]
         ax2.plot(percentile_window.age, percentile_window.P5, color="lightblue")
         ax2.plot(
@@ -476,21 +478,21 @@ def overlap_view_double_pediatrics(
         )
         ax1.plot(percentile_window_ht.age, percentile_window_ht.P95, color="pink")
 
-    if show_all_measurements == True:
+    if show_all_measurements is True:
         ax1.plot(height["age"], height["measurement"], color=color, label="stature")
         ax2.plot(
             weight["age"], weight["measurement"], color=color_secondary, label="weight"
         )
 
-    if show_excluded_values == True:
+    if show_excluded_values is True:
         ax1.scatter(
             excluded_height.age, excluded_height.measurement, c="black", marker="x"
         )
         ax2.scatter(
             excluded_weight.age, excluded_weight.measurement, c="black", marker="x"
         )
 
-    if show_trajectory_with_exclusions == True:
+    if show_trajectory_with_exclusions is True:
         ax1.plot(
             included_height["age"],
             included_height["measurement"],
@@ -509,7 +511,7 @@ def overlap_view_double_pediatrics(
 
     fig.tight_layout()  # otherwise the right y-label is slightly clipped
 
-    if include_carry_forward == True:
+    if include_carry_forward is True:
         carry_forward_height = height[height.clean_value == "Exclude-Carried-Forward"]
         carry_forward_weight = weight[weight.clean_value == "Exclude-Carried-Forward"]
         ax1.scatter(
@@ -594,7 +596,7 @@ def five_by_five_view(obs_df, subjids, param, wt_df, ht_df, bmi_df, linestyle):
         for x in range(nrows):
             try:
                 subjid = subjids[x * 5 + y]
-            except IndexError as ie:
+            except IndexError:
                 # No more subjects to render
                 break
             individual = obs_df[obs_df.subjid == subjid]
@@ -758,15 +760,15 @@ def top_ten(
     in the notebook.
     """
     working_set = merged_df
-    if age != None:
+    if age is not None:
         working_set = working_set.loc[
             working_set.rounded_age.ge(age[0]) & working_set.rounded_age.le(age[1])
         ]
-    if sex != None:
+    if sex is not None:
         working_set = working_set[working_set.sex == sex]
-    if wexclusion != None:
+    if wexclusion is not None:
         working_set = working_set[working_set.weight_cat.isin(wexclusion)]
-    if hexclusion != None:
+    if hexclusion is not None:
         working_set = working_set[working_set.height_cat.isin(hexclusion)]
     # if order == 'largest':
     #   working_set = working_set.nlargest(10, field)
@@ -796,7 +798,7 @@ def top_ten(
             "BMIz",
         ]
     ]
-    if out == None:
+    if out is None:
         return working_set
     else:
         out.clear_output()
 
@@ -130,7 +130,7 @@ def subject_stats_comparison(combined_df):
     for rn in combined_df.run_name.unique():
         total_subjects = combined_df[combined_df.run_name == rn].subjid.nunique()
         only_exclusions = combined_df[
-            (combined_df.run_name == rn) & (combined_df.include == False)
+            (combined_df.run_name == rn) & combined_df.include.eq(False)
         ]
         percent_with_exclusion = (
             only_exclusions.subjid.nunique() / total_subjects
 
@@ -1,16 +1,15 @@
-from IPython.display import FileLinks
-import matplotlib as mpl
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from IPython.display import FileLinks
 
 
 def setup_individual_obs_df(obs_df):
     """
     Standardizes adults and pediatrics files for clean processing in GrowthViz notebooks
 
     Parameters:
-    obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value columns
+    obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value
+        columns
 
     Returns:
     DataFrame with updated columns
@@ -135,16 +134,23 @@ def setup_percentiles_pediatrics(percentiles_file):
 
 def keep_age_range(df, mode):
     """
-    Returns specified age range
+    Returns specified age range, removing extraneous columns as well
 
     Parameters:
-    df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
-        category, colors, patterns, and sort_order columns
-    mode: (str) indicates whether you want the "adults" (18-80) or "pediatrics" (0-25) values
+    df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, and
+        include columns
+    mode: (str) indicates whether you want the "adults" (18-80) or "pediatrics" (0-25)
+        values
 
     Returns:
     DataFrame with the extra columns removed; ages are filtered only if mode is valid
     """
+    # Note: this is a side effect; just the simplest place to remove these
+    cols_to_drop = []
+    for extra_col in ["clean_cat", "category", "colors", "patterns", "sort_order"]:
+        if extra_col in df.columns:
+            cols_to_drop.append(extra_col)
+    df = df.drop(columns=cols_to_drop)
     if mode == "adults":
         return df[df["age"].between(18, 80, inclusive="both")]
     elif mode == "pediatrics":
@@ -158,7 +164,8 @@ def setup_merged_df(obs_df):
     Merges together weight and height data for calculating BMI
 
     Parameters:
-    obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value columns
+    obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value
+        columns
 
     Returns:
     DataFrame with merged data
@@ -206,7 +213,8 @@ def setup_merged_df(obs_df):
 
 def exclusion_information(obs):
     """
-    Provides a count and percentage of growthcleanr categories by measurement type (param).
+    Provides a count and percentage of growthcleanr categories by measurement type
+    (param).
 
     Parameters:
     obs: a DataFrame, in the format output by setup_individual_obs_df
@@ -238,15 +246,16 @@ def exclusion_information(obs):
 
 def label_incl(row):
     """
-    Categorizes BMI calculations as Include, Implausible, or unable to calculate (Only Wt or Ht)
+    Categorizes BMI calculations as Include, Implausible, or unable to calculate (Only
+    Wt or Ht)
 
     Parameters:
     row: (Series) dataframe row
 
     Returns:
     Category (str) for BMI calculation
     """
-    if row["include_both"] == True:
+    if row["include_both"] is True:
         return "Include"
     elif (row["weight_cat"] == "Implausible") | (row["height_cat"] == "Implausible"):
         return "Implausible"
@@ -261,8 +270,8 @@ def setup_bmi_adults(merged_df, obs):
     Parameters:
     merged_df: (DataFrame) with subjid, bmi, include_height, include_weight, rounded_age
                and sex columns
-    obs: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
-        category, colors, patterns, and sort_order columns
+    obs: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat,
+        include, category, colors, patterns, and sort_order columns
 
     Returns:
     DataFrame with appended values
@@ -324,15 +333,17 @@ def export_to_csv(da_locals, selection_widget, out):
 
 def clean_swapped_values(merged_df):
     """
-    This function will look in a DataFrame for rows where the height_cat and weight_cat are set to
-    "Swapped-Measurements" (or the adult equivalent). It will then swap the height and weight values
-    for those rows, and recalculate BMIs based on these changes. It will also create two new columns:
-    postprocess_height_cat and postprocess_weight_cat. The values for these columns are copied from
-    the original categories except in the case where swaps are fixed when it is set to
+    This function will look in a DataFrame for rows where the height_cat and weight_cat
+    are set to "Swapped-Measurements" (or the adult equivalent). It will then swap the
+    height and weight values for those rows, and recalculate BMIs based on these
+    changes. It will also create two new columns: postprocess_height_cat and
+    postprocess_weight_cat. The values for these columns are copied from the original
+    categories, except for rows where a swap was fixed, which are set to
     "Include-Fixed-Swap".
 
     Parameters:
-    merged_df: (DataFrame) with subjid, height, weight, include_height and include_weight columns
+    merged_df: (DataFrame) with subjid, height, weight, include_height and
+        include_weight columns
 
     Returns:
     The cleaned DataFrame
@@ -368,20 +379,21 @@ def clean_swapped_values(merged_df):
 
 def clean_unit_errors(merged_df):
     """
-    This function will look in a DataFrame for rows where the height_cat and weight_cat are set to
-    "Unit-Error-High" or "Unit-Error-Low". It will then multiply / divide the height and weight
-    values to convert them.  It will also create two new columns: postprocess_height_cat and
-    postprocess_weight_cat.  The values for these columns are copied from the original categories
-    except in the case where unit errors are fixed when it is set to "Include-UH" or "Include-UL"
-    respectively.
-
-    At present, the adult algorithm does not specify high or low unit errors, rather it only flags
-    "Exclude-Adult-Unit-Errors", so this function only works with pediatrics data. If growthcleanr
-    adds high and low designations for adult unit errors, a comparable set of conditions could be
-    added here to accommodate adult data.
+    This function will look in a DataFrame for rows where the height_cat and weight_cat
+    are set to "Unit-Error-High" or "Unit-Error-Low". It will then multiply / divide
+    the height and weight values to convert them. It will also create two new columns:
+    postprocess_height_cat and postprocess_weight_cat. The values for these columns
+    are copied from the original categories, except for rows where a unit error was
+    fixed, which are set to "Include-UH" or "Include-UL" respectively.
+
+    At present, the adult algorithm does not specify high or low unit errors, rather it
+    only flags "Exclude-Adult-Unit-Errors", so this function only works with pediatrics
+    data. If growthcleanr adds high and low designations for adult unit errors, a
+    comparable set of conditions could be added here to accommodate adult data.
 
     Parameters:
-    merged_df: (DataFrame) with subjid, height, weight, include_height and include_weight columns
+    merged_df: (DataFrame) with subjid, height, weight, include_height and
+        include_weight columns
 
     Returns:
     The cleaned DataFrame
 
@@ -1,5 +1,5 @@
-from IPython.display import Markdown
 import numpy as np
+from IPython.display import Markdown
 
 
 def setup_percentile_zscore_adults(percentiles_clean):
@@ -169,7 +169,7 @@ def bmi_stats(
         merged_stats = merged_stats.rename(
             columns={"std_raw": "sd_raw", "std_clean": "sd_clean"}
         )
-    if out == None:
+    if out is None:
         return merged_stats
     else:
         # Clear output on first update and all subsequent updates, see
 
@@ -1,4 +1,5 @@
 ipywidgets~=7.0
+jupyter-server<2.0.0
 matplotlib>=3.3.4
 pandas>=1.2.2
 qgrid>=1.3.1
Original file line number	Diff line number	Diff line change
`@@ -130,7 +130,7 @@ def subject_stats_comparison(combined_df):`
`130`	`130`	`for rn in combined_df.run_name.unique():`
`131`	`131`	`total_subjects = combined_df[combined_df.run_name == rn].subjid.nunique()`
`132`	`132`	`only_exclusions = combined_df[`
`133`		`- (combined_df.run_name == rn) & (combined_df.include == False)`
	`133`	`+ (combined_df.run_name == rn) & combined_df.include.eq(False)`
`134`	`134`	`]`
`135`	`135`	`percent_with_exclusion = (`
`136`	`136`	`only_exclusions.subjid.nunique() / total_subjects`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`ipywidgets~=7.0`
	`2`	`+jupyter-server<2.0.0`
`2`	`3`	`matplotlib>=3.3.4`
`3`	`4`	`pandas>=1.2.2`
`4`	`5`	`qgrid>=1.3.1`