Skip to content

Commit 46eb2a7

Browse files
authored
Merge pull request #2663 from sciencehistory/add_ocr_to_more_works
Add OCR to some more works
2 parents 05429e6 + 306905a commit 46eb2a7

File tree

1 file changed

+272
-0
lines changed

1 file changed

+272
-0
lines changed
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
namespace :scihist do
  namespace :data_fixes do
    desc <<~DESC
      Set ocr_requested to true for a new batch of works (june 2024)
        rake scihist:data_fixes:set_ocr_requested_june_2024

      Do just one:
        rake scihist:data_fixes:set_ocr_requested_june_2024['aso1ecq']
    DESC
    # One-off data fix: turn on OCR for a fixed batch of works.
    #
    # For each friendlier_id in the batch (or only the single id passed as the
    # optional `work_friendlier_id` task argument) the task:
    #   * logs and moves on when no Work has that friendlier_id;
    #   * logs and moves on when the work already has ocr_requested set;
    #   * otherwise sets ocr_requested = true, saves with `save!` (so a failed
    #     save raises and aborts the run), and enqueues WorkOcrCreatorRemoverJob
    #     on the "special_jobs" queue.
    #
    # The progress bar is advanced exactly once per id, whatever the outcome, so
    # it always finishes at `total`.
    task :set_ocr_requested_june_2024, [:work_friendlier_id] => :environment do |_t, args|
      items_needing_ocr = %w[
        00000059p
        10el2db
        1abq6cv
        1ndf3hf
        1v53jx89f
        1x93vdg
        1z40kt41h
        1zshph4
        20kxw7k
        25dyk6p
        27mo804
        2z10wr23f
        37720d24n
        3dbh0ng
        3dc72as
        3j333301b
        3j3333398
        3n203z173
        3nz58ef
        3r074v594
        41687j69c
        46smfc1
        4b29b690d
        4dcl1aq
        4hml3f5
        4qlor3l
        4w5cwnu
        4x51hj81r
        50cnjm9
        58s4wy3
        5999n455n
        5999n456x
        5h73pw14h
        5l15b5i
        5m60qs97h
        5sgmjgh
        5w3x3vu
        6108vb34k
        6d56zx16p
        6hz4gzn
        6t053h17b
        6w924c48b
        73x6up6
        7fmih3e
        7g7sykv
        7hlt38t
        7kssjdm
        7p05mi5
        7r9fbhw
        7ucpb1m
        7uk1dod
        7w62f955m
        8910jv725
        8c97kq63g
        8g84mm76s
        8lelsrz
        8r00347
        8vd3kbc
        9306sz691
        9880vr88p
        9ag3q0u
        9c67wn74v
        9chrkef
        9hu7g0h
        9k41zd82t
        9s161738c
        9w0323862
        a25w764
        actytlk
        ak8mcz4
        akxmjsq
        aso1ecq
        asx7t5o
        awm78yy
        b2773w23c
        bg257g44c
        bgd84t5
        blz0d1e
        brpy6h2
        by8cen9
        bzbzq6p
        c26dktp
        c5hekt0
        c5ri8xu
        c73lqd6
        c7jixao
        cf1g27k
        cn69m481p
        cnlv4q9
        dh8qks4
        dnel7ah
        dq1vcry
        drkkqip
        dwh6udb
        dxxyzo5
        dzskowv
        ej5g6p0
        f1881n22k
        f3vunv2
        fb494960s
        fc9f8sh
        fhiw36j
        fk7gfc7
        g4f4xrr
        gh93h0704
        gt54kn070
        guxwfm4
        gx41mk00h
        gzh9c7d
        h128nf12n
        h2wpjqz
        h415pb65x
        h702q750j
        hcxstvh
        hd76s099h
        hh63sw38v
        hhohjvz
        hm50ts226
        hm50ts49m
        hmo12zt
        hnaq4i6
        hqr0vep
        i3ntrqa
        if1y8jh
        ip5xqsz
        iqir275
        j098zb837
        j7eqggo
        j965jda
        jd472x647
        jq085m03h
        jswrvtg
        juzgqqv
        jxn9x1v
        k06988730
        k4424ph
        kh04dq16g
        kjzq5sx
        kw52j813n
        kw52j944v
        lfk1l7p
        li2hswd
        liut8yp
        m039k620t
        m326m2699
        mkicy9w
        mkmg3gr
        ml2d1fx
        ms35t945j
        ms35t9537
        mw22v6127
        ng451j669
        nk61zfx
        nlp95ls
        no6jc8u
        nqee13h
        oevh9ff
        ohfepv5
        oxczuzg
        p8xwtv7
        pg6ay55
        pk02cb51k
        pn89d780h
        ppze54g
        pr8v5c1
        px4ksww
        q811kk83q
        qc0geu2
        qjvy4hr
        qkbcb6u
        qn59q441g
        r08cysh
        r207tp471
        rcgynxk
        rcofrgk
        s28p4ir
        s7526c98r
        sffnt1m
        sieyazl
        sj139301v
        st74cr668
        t0xt4ls
        tb09j693r
        th83m023h
        torh4pp
        u1cnmj1
        u4pvoiq
        ukvpxbo
        uoeae9x
        ut99im9
        uwp76sh
        uye351l
        v405s9778
        v979v413n
        vh53ww37b
        vt150j93m
        vwn5gsm
        vzn1kaq
        w37637980
        wd375w35b
        we6qich
        whpeetf
        wkrtx7g
        wmlkiom
        wp988m09w
        ws3za03
        wsa6nnu
        wtl6m8z
        wxtwzjc
        x0sn5hu
        x633f153j
        x6rar31
        x70vejb
        x78peme
        xd07gs80t
        xd07gt25z
        xpej45y
        xw42n881j
        y11535o
        yasty4z
        yv8v2uh
        z029p529v
        z82jpz5
        zc3osyv
        zugqcua
      ]

      # An explicit task argument replaces the whole batch with that one id.
      items_needing_ocr = [args[:work_friendlier_id]] if args[:work_friendlier_id].present?

      progress_bar = ProgressBar.create(
        total: items_needing_ocr.size,
        format: Kithe::STANDARD_PROGRESS_BAR_FORMAT
      )
      ocr_enqueued_count = 0

      # NOTE(review): index_with(batching: true) presumably batches the search
      # index updates triggered by the saves below -- confirm against Kithe docs.
      Kithe::Indexable.index_with(batching: true) do
        items_needing_ocr.each do |work_id|
          work = Work.find_by_friendlier_id(work_id)

          if work.nil?
            progress_bar.log("Couldn't find work #{work_id}")
          elsif work.ocr_requested?
            progress_bar.log("SKIPPING: Already turned on for #{work_id}")
          else
            work.ocr_requested = true
            work.save!
            WorkOcrCreatorRemoverJob.set(queue: "special_jobs").perform_later(work)
            # Report through the bar, like the other per-work messages, so the
            # line is not interleaved with the bar's own redraws.
            progress_bar.log("OCR enqueued for #{work_id}.")
            ocr_enqueued_count += 1
          end

          # Every id counts towards progress, including missing works;
          # otherwise the bar stalls short of `total`.
          progress_bar.increment
        end
      end

      puts "\nOCR enqueued for #{ocr_enqueued_count} works"
    end
  end
end

0 commit comments

Comments
 (0)