Skip to content

Commit d6c1338

Browse files
add scripts for templating
1 parent b1e1492 commit d6c1338

File tree

2 files changed

+1296
-0
lines changed

2 files changed

+1296
-0
lines changed
+371
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
import pandas as pd
2+
import numpy as np
3+
import io
4+
import json
5+
from BBQ_Full.utils import *
6+
import random
7+
import re
8+
import ast
9+
10+
# All bias categories covered by BBQ; each has its own template CSV and
# produces its own output .jsonl file in the loop below.
cats = [
    "Disability_status",
    "Age",
    "Physical_appearance",
    "SES",
    "Gender_identity",
    "Race_ethnicity",
    "Religion",
    "Nationality",
    "Sexual_orientation",
]

# Read in vocabulary files.
# `vocab` holds the common-noun vocabulary; rows explicitly marked
# Pilot_include == "No" are dropped. `names_vocab` holds proper names
# (first/last) with gender/ethnicity metadata.
vocab = pd.read_csv("BBQ_full/templates/vocabulary.csv")
vocab = vocab[vocab.Pilot_include != "No"]
names_vocab = pd.read_csv("BBQ_full/templates/vocabulary_proper_names.csv")

# Acceptable surface forms of the "unknown" answer option; used downstream
# when building the answer choices for each generated item.
unknown_options = [
    "Unknown",
    "Cannot be determined",
    "Can't be determined",
    "Not answerable",
    "Not known",
    "Not enough info",
    "Not enough information",
    "Cannot answer",
    "Can't answer",
    "Undetermined",
]

# Categories whose items need the argument order manually reversed
# (i.e. each NAME1/NAME2 pairing is also emitted with the names swapped).
need_stereotyping_subset = [
    "Race_ethnicity",
    "Religion",
    "Nationality",
    "Sexual_orientation",
]
49+
# Big loop, does everything: for each category, expand every template row
# into concrete items by slotting vocabulary words / proper names into the
# {{NAME}} and {{WORD}} placeholders, then write the results as JSONL.
for cat in cats:
    frames = pd.read_csv(
        "BBQ_full/templates/new_templates - %s.csv" % cat, na_filter=False
    )  # get the right template file (na_filter=False keeps empty cells as "")

    dat_file = io.open(
        "BBQ_full/data/%s.jsonl" % cat, "w"
    )  # open the file that all the data will be saved in

    the_frames = frames[
        frames.Ambiguous_Context != ""
    ].reset_index()  # get rid of rows that are empty
    frame_cols = the_frames.columns

    # initialize item id counter
    nn = 0

    for i in range(len(the_frames)):

        # By default, take the words from the vocab that match this category.
        # This needs to be done at each iteration because some rows use
        # custom vocab and some don't.
        words = vocab[vocab.Category == cat]

        # make sure that one of the words used is the target of the bias
        possible_word_list = words.Name.unique().tolist()
        bias_targets = the_frames.Known_stereotyped_groups[i]

        # Stereotyped-groups info is not relevant for subsetting Names in some
        # categories, so only use it for creating the targeted word list when
        # the info is provided; otherwise all possible vocab items are targets.
        # NOTE: the cell is a string here; ast.literal_eval turns it into a
        # Python list. Assumes the cell is never empty — TODO confirm.
        if (len(bias_targets) > 1) and (cat in need_stereotyping_subset):
            bias_targets = ast.literal_eval(the_frames.Known_stereotyped_groups[i])
            targeted_word_list = [x for x in possible_word_list if x in bias_targets]
        else:
            targeted_word_list = possible_word_list
            bias_targets = ast.literal_eval(the_frames.Known_stereotyped_groups[i])

        # if the list of bias targets is too big, downsample
        if len(targeted_word_list) > 4:
            word_list = random.sample(targeted_word_list, 5)
        elif len(possible_word_list) < 2:  # these will be handled later
            word_list = []
        else:
            word_list = targeted_word_list

        # Set really helpful boolean: determines whether NAME1/NAME2 need to
        # be proper names. The template stores the flag as the string "TRUE".
        if "Proper_nouns_only" in frame_cols:
            if the_frames.Proper_nouns_only[i] == "TRUE":
                has_proper_name = True
            else:
                has_proper_name = False
        else:
            has_proper_name = False

        # if the word list needs to be further pulled apart within
        # subcategory, do that here
        if "Subcategory" in frame_cols:
            this_subcat = the_frames.Subcategory[i]
            if len(this_subcat) > 1:
                words = words[words.SubCat == this_subcat]
                word_list = words.Name.unique().tolist()
                if len(word_list) > 5:
                    word_list = random.sample(word_list, 5)  # for downsampling
            else:
                this_subcat = "None"
        else:
            this_subcat = "None"

        # Check if there's vocab in the Names column that needs to be used.
        # Info in the Names column always overrides vocab that would
        # otherwise be used.
        if "Names" in frame_cols:
            critical_words = the_frames.Names[i]
            if len(critical_words) > 1:
                word_list, new_word_list = return_list_from_string(critical_words)
        else:  # still need to initialize this variable for cats w/out "Names"
            critical_words = ""

        # check if this one needs to use proper names
        if has_proper_name:
            # Race_ethnicity: pull from the set of first names that carry
            # ethnicity information, restricted to the bias targets if given.
            if cat == "Race_ethnicity":
                if len(bias_targets) >= 1:
                    first_names_full = names_vocab[names_vocab.First_last == "first"]
                    first_names = first_names_full[
                        first_names_full.ethnicity.isin(bias_targets)
                    ]
                else:
                    first_names = names_vocab[names_vocab.First_last == "first"]
                word_list = first_names.Name.tolist()
                word_list = random.sample(word_list, 5)  # for downsampling
            # Gender_identity: pull from first names without ethnicity info.
            # Specify female so the other NAME variable can be set to male
            # and there's always a contrast.
            elif cat == "Gender_identity":
                first_names = names_vocab[names_vocab.First_last == "first_only"]
                female_first_names = first_names[first_names.gender == "F"]
                word_list = female_first_names.Name.tolist()
                word_list = random.sample(word_list, 5)  # for downsampling
            # A handful of templates in other categories use proper nouns.
            # These don't have to be gender-specific but should not be
            # ethnically identifying, so pull from the subset that doesn't
            # correlate strongly to one racial/ethnic category.
            else:
                first_names = names_vocab[names_vocab.First_last == "first_only"]
                word_list = first_names.Name.tolist()
                word_list = random.sample(word_list, 6)  # for downsampling

        # iterate over each word in word_list
        for j in range(len(word_list)):
            # initialize variables for later
            Name1_info = None
            Name2_info = None
            rand_wrd1 = ""
            rand_wrd2 = ""
            Name2_info_dict = {}

            # current target word
            this_word = word_list[j]

            # only create new_word_list here if it wasn't already created
            # through the Names column
            if len(critical_words) < 2:
                if (len(bias_targets) > 0) and (cat in need_stereotyping_subset):
                    new_word_list = [
                        n for n in possible_word_list if n not in bias_targets
                    ]
                    if len(new_word_list) > 4:
                        new_word_list = random.sample(
                            new_word_list, 5
                        )  # downsample when needed
                else:
                    new_word_list = [n for n in word_list if n != this_word]

            # for SES, make sure the second name is grabbed from the other
            # (low/high) category
            if cat == "SES" and not has_proper_name:
                the_word_cat = words.loc[words["Name"] == this_word, "Info"].iloc[0]
                Name1_info = the_word_cat
                new_options = words[words.Info != the_word_cat]
                new_word_list = new_options.Name.unique().tolist()

            # for gender, keep track of the gender of the name picked and
            # select a name of a different gender for the second name variable
            if cat == "Gender_identity" and has_proper_name:
                the_word_gender = first_names.loc[
                    first_names["Name"] == this_word, "gender"
                ].iloc[0]
                Name1_info = the_word_gender
                new_options = first_names[first_names.gender != the_word_gender]
                new_word_list = new_options.Name.unique().tolist()
                new_word_list = random.sample(new_word_list, 5)  # for downsampling

            # for race/ethnicity...
            if cat == "Race_ethnicity" and has_proper_name:
                # keep track of gender and ethnicity of the name in question
                the_word_gender = first_names.loc[
                    first_names["Name"] == this_word, "gender"
                ].iloc[0]
                the_word_eth = first_names.loc[
                    first_names["Name"] == this_word, "ethnicity"
                ].iloc[0]
                Name1_info = the_word_gender + "-" + the_word_eth
                # select a last name that matches ethnicity, then put them together
                last_names = names_vocab[
                    (names_vocab.First_last == "last")
                    & (names_vocab.ethnicity == the_word_eth)
                ]
                last_names_list = last_names.Name.unique().tolist()
                this_last_name = random.choice(last_names_list)
                this_word = this_word + " " + this_last_name
                # Create a list of names for the second name var with the
                # same gender but an ethnicity that's not one of the bias
                # targets for this template.
                other_first_names = first_names_full[
                    (~first_names_full.ethnicity.isin(bias_targets))
                    & (first_names_full.gender == the_word_gender)
                ].reset_index()
                other_first_names = other_first_names.sample(
                    n=5, replace=False
                ).reset_index()  # for downsampling
                other_last_names = names_vocab[
                    (names_vocab.First_last == "last")
                    & (~names_vocab.ethnicity.isin(bias_targets))
                ].reset_index()
                # initialize list and dict to keep track of the second name
                # variables to create
                new_word_list = []
                Name2_info_dict = {}
                for nam in range(len(other_first_names)):
                    frst = other_first_names.Name[nam]
                    eth = other_first_names.ethnicity[nam]
                    gen = other_first_names.gender[nam]
                    Name2_info = gen + "-" + eth
                    lst_list = (
                        other_last_names[other_last_names.ethnicity == eth]
                        .Name.unique()
                        .tolist()
                    )
                    lst = random.choice(lst_list)
                    new_word_list.append(frst + " " + lst)
                    Name2_info_dict[frst + " " + lst] = Name2_info

            # Iterate over each word in the second word list; results in full
            # pairings of every possible vocab pair within the subset.
            for k in range(len(new_word_list)):
                this_word_2 = new_word_list[k]
                this_frame_row = the_frames.iloc[[i]].reset_index()
                lex_div = this_frame_row.Lexical_diversity[0]

                # only need to create these values when there's text in
                # the lexical-diversity column
                if len(lex_div) > 1:
                    wrdlist1, wrdlist2 = return_list_from_string(lex_div)
                    rand_wrd1 = random.choice(wrdlist1)
                    if len(wrdlist2) > 1:  # sometimes there's not a WORD2
                        rand_wrd2 = random.choice(wrdlist2)

                # replace frame text with the values of {{NAME}} and {{WORD}};
                # checks the value for each column
                new_frame_row = do_slotting(
                    this_frame_row,
                    frame_cols,
                    this_word,
                    None,
                    this_word_2,
                    None,
                    lex_div,
                    rand_wrd1,
                    rand_wrd2,
                )

                # need to record info about the names that were used for
                # easier analysis later
                if (
                    Name1_info is not None
                    and cat == "Race_ethnicity"
                    and has_proper_name
                ):
                    Name2_info = Name2_info_dict[this_word_2]
                elif cat == "Gender_identity" and has_proper_name:
                    try:
                        Name2_info = first_names.loc[
                            first_names["Name"] == this_word_2, "gender"
                        ].iloc[0]
                    except IndexError:
                        Name2_info = this_word_2
                    if this_frame_row.NAME1_info[0] != "":
                        # In some cases there's info both from the name
                        # selected and from an info column in the template;
                        # need to save both pieces. Name1_info is only
                        # prefixed once (k == 0) so it isn't re-prefixed on
                        # every inner iteration.
                        if k == 0:
                            Name1_info = this_frame_row.NAME1_info[0] + "_" + Name1_info
                        Name2_info = this_frame_row.NAME2_info[0] + "_" + Name2_info
                elif cat == "Nationality" or (
                    cat == "SES" and this_subcat == "Occupation"
                ):
                    # need to get the relevant info about the name from the vocab file
                    Name1_info = vocab.loc[vocab["Name"] == this_word, "Info"].iloc[0]
                    Name2_info = vocab.loc[vocab["Name"] == this_word_2, "Info"].iloc[0]
                elif "NAME1_info" in frame_cols:
                    # for when the info about the name variables is stored
                    # in the templates
                    if this_frame_row.NAME1_info[0] != "":
                        Name1_info = this_frame_row.NAME1_info[0]
                        Name2_info = this_frame_row.NAME2_info[0]
                    else:
                        Name1_info = this_word
                        Name2_info = this_word_2
                else:
                    # if none of the above apply, just store the info as the
                    # actual string used in the name
                    Name1_info = this_word
                    Name2_info = this_word_2

                # create four sets of data, each as a dictionary
                dat_formatted = create_templating_dicts(
                    cat,
                    new_frame_row,
                    this_subcat,
                    unknown_options,
                    frame_cols,
                    bias_targets,
                    this_word,
                    this_word_2,
                    Name1_info,
                    Name2_info,
                    nn,
                )
                nn += 4
                for item in dat_formatted:
                    dat_file.write(json.dumps(item, default=str))
                    dat_file.write("\n")
                dat_file.flush()

                if (
                    (cat in need_stereotyping_subset)
                    or has_proper_name
                    or (cat == "SES" and this_subcat == "Occupation")
                ):
                    # flip input of this_word and this_word_2
                    new_frame_row = do_slotting(
                        this_frame_row,
                        frame_cols,
                        this_word_2,
                        None,
                        this_word,
                        None,
                        lex_div,
                        rand_wrd1,
                        rand_wrd2,
                    )

                    # create four sets of data, each as a dictionary
                    dat_formatted = create_templating_dicts(
                        cat,
                        new_frame_row,
                        this_subcat,
                        unknown_options,
                        frame_cols,
                        bias_targets,
                        this_word_2,
                        this_word,
                        Name2_info,
                        Name1_info,
                        nn,
                    )
                    nn += 4
                    for item in dat_formatted:
                        dat_file.write(json.dumps(item, default=str))
                        dat_file.write("\n")
                    dat_file.flush()

    print("generated %s sentences total for %s" % (str(nn), cat))

    dat_file.close()

0 commit comments

Comments
 (0)