Skip to content

Commit a2847a7

Browse files
authored
Merge pull request #75 from UAL-RE/66-keep-track-of-how-many-items-were-successfully-bagged-new
Feat: Keep track of items successfully processed (Issue #66)
2 parents b348a96 + 99b5fc8 commit a2847a7

File tree

5 files changed

+92
-21
lines changed

5 files changed

+92
-21
lines changed

Log.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ def __init__(self, config):
2323
self.file_path = log_location + file_name
2424

2525
self.ansi_terminal = _check_ansi()
26+
self.warnings_count = 0
27+
self.errors_count = 0
2628

2729
def log_config(self, in_terminal: bool = False):
2830
if (in_terminal):
@@ -37,13 +39,15 @@ def log_config(self, in_terminal: bool = False):
3739
def show_log_in_terminal(self, type, message, stop_script=False):
3840
# Show log in terminal
3941
self.log_config(True)
42+
self._count_errorwarning(type)
4043
self.message(type, message)
4144
if (stop_script is True):
4245
exit()
4346

4447
def write_log_in_file(self, type, message, show_in_terminal=False, stop_script=False):
4548
# Show log in file
4649
self.log_config(False)
50+
self._count_errorwarning(type)
4751
if (show_in_terminal is True):
4852
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3] + ":" + self._format_messagetype_ansi(type.upper()) + ": " + message)
4953
self.message(type, message)
@@ -72,6 +76,15 @@ def message(self, type, message):
7276
logger.error(message)
7377
del logger
7478

79+
def _count_errorwarning(self, msgtype):
80+
'''
81+
Counts how many times a message type (string) of warning or error is passed in
82+
'''
83+
if msgtype.lower() == 'warning':
84+
self.warnings_count += 1
85+
if msgtype.lower() == 'error':
86+
self.errors_count += 1
87+
7588
def _format_messagetype_ansi(self, type):
7689
'''
7790
Returns a colorized version of the given message type string. If no ANSI support is detected, the same string is returned unchanged.

app.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,14 @@ def main():
143143
True)
144144
article_obj = Article(config_file_path, log, args.ids)
145145
article_data = article_obj.get_articles()
146+
147+
articles_count = 0
148+
articles_versions_count = 0
149+
for i, (k, v) in enumerate(article_data.items()):
150+
articles_count += 1
151+
articles_versions_count += len(v)
146152
log.write_log_in_file('info',
147-
f"Total articles fetched: {len(article_data)}.",
153+
f"Total articles fetched: {len(article_data)}. Total articles versions fetched: {articles_versions_count}.",
148154
True)
149155
print(" ")
150156

@@ -153,15 +159,45 @@ def main():
153159
True)
154160
collection_obj = Collection(config_file_path, log, args.ids)
155161
collection_data = collection_obj.get_collections()
162+
163+
collections_count = 0
164+
collections_versions_count = 0
165+
for i, (k, v) in enumerate(collection_data.items()):
166+
collections_count += 1
167+
collections_versions_count += len(v['versions'])
156168
log.write_log_in_file('info',
157-
f"Total collections fetched: {len(collection_data)}.",
169+
f"Total collections fetched: {collections_count}. Total collections versions fetched: {collections_versions_count}.",
158170
True)
159171
print(" ")
160172

161173
# Start articles processing after completing fetching data from API
162-
article_obj.process_articles(article_data, article_obj.total_all_articles_file_size)
174+
processed_articles_versions_count = article_obj.process_articles(article_data)
163175

164176
# Start collections processing after completing fetching data from API and articles processing.
165-
collection_obj.process_collections(collection_data)
177+
processed_collections_versions_count = collection_obj.process_collections(collection_data)
178+
179+
log.write_log_in_file('info',
180+
"Total articles versions processed/fetched: \t\t\t"
181+
+ f'{processed_articles_versions_count} / {articles_versions_count}',
182+
True)
183+
log.write_log_in_file('info',
184+
"Total processed articles bags already in preservation storage: \t"
185+
+ f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
186+
True)
187+
log.write_log_in_file('info',
188+
"Total collections versions processed/fetched: \t\t\t"
189+
+ f'{processed_collections_versions_count} / {collections_versions_count}',
190+
True)
191+
log.write_log_in_file('info',
192+
"Total processed collections bags already in preservation storage: "
193+
+ f'{collection_obj.processor.duplicate_bag_in_preservation_storage_count}',
194+
True)
166195

167-
log.write_log_in_file('info', "ReBACH script has successfully finished.", True, True)
196+
if processed_articles_versions_count != articles_versions_count or processed_collections_versions_count != collections_versions_count:
197+
log.write_log_in_file('warning',
198+
'The number of articles versions or collections versions successfully processed is different'
199+
+ ' than the number fetched. Check the log for details.', True)
200+
201+
log.write_log_in_file('info',
202+
f"ReBACH finished with {log.warnings_count} warnings and {log.errors_count} errors",
203+
True)

figshare/Article.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -738,10 +738,13 @@ def find_matched_articles(self, articles):
738738

739739
self.logs.write_log_in_file("info", f"Total matched unique articles: {len(set(matched_articles))}.", True)
740740
self.logs.write_log_in_file("info", f"Total unmatched unique articles: {len(set(unmatched_articles))}.", True)
741-
742741
self.logs.write_log_in_file("info", f"Total matched article versions: {no_matched}.", True)
743742
self.logs.write_log_in_file("info", f"Total unmatched article versions: {len(self.article_non_match_info)}.", True)
744743

744+
if len(set(unmatched_articles)) > 0 or len(self.article_non_match_info) > 0:
745+
self.logs.write_log_in_file("warning", "There were unmatched articles or article versions. "
746+
+ f"Check {self.curation_storage_location} for each of the unmatched items.", True)
747+
745748
return article_data
746749

747750
"""
@@ -761,7 +764,7 @@ def __can_copy_files(self, version_data):
761764
return copy_files
762765

763766
"""
764-
Final process for matched articles.
767+
Final process for matched articles. Returns True if succeeded.
765768
"""
766769
def __final_process(self, check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process):
767770
success = True
@@ -798,14 +801,19 @@ def __final_process(self, check_files, copy_files, check_dir, version_data, fold
798801
self.logs.write_log_in_file("error",
799802
f"{version_data['id']} version {version_data['version']} - Post-processing script failed.",
800803
True)
804+
success = False
805+
else:
806+
success = True
801807
else:
802808
self.logs.write_log_in_file("info",
803809
f"No further processing for {version_data['id']} version {version_data['version']} due to errors.",
804810
True)
811+
success = False
805812
else:
806813
# if download process has any errors then delete complete folder
807814
self.logs.write_log_in_file("info", "Download process had an error so complete folder is being deleted.", True)
808815
self.delete_folder(check_dir)
816+
success = False
809817
else:
810818
if check_files or copy_files:
811819
if success:
@@ -815,12 +823,18 @@ def __final_process(self, check_files, copy_files, check_dir, version_data, fold
815823
self.logs.write_log_in_file("error",
816824
f"{version_data['id']} version {version_data['version']} - Post-processing script failed.",
817825
True)
826+
success = False
827+
else:
828+
success = True
818829
else:
819830
self.logs.write_log_in_file("info",
820831
f"No further processing for {version_data['id']} version {version_data['version']} due to errors.",
821832
True)
833+
success = False
822834
else:
823835
self.logs.write_log_in_file("error", "Unexpected condition in final processing. No further actions taken.", True)
836+
success = False
837+
return success
824838

825839
"""
826840
Called before articles processing.
@@ -839,9 +853,10 @@ def __initial_process(self):
839853
return curation_storage_location
840854

841855
"""
842-
Process all articles after fetching from API.
856+
Process all articles after fetching from API. Returns the number of successfully processed articles.
843857
"""
844-
def process_articles(self, articles, total_file_size):
858+
def process_articles(self, articles):
859+
processed_count = 0
845860
curation_storage_location = self.__initial_process()
846861
self.logs.write_log_in_file("info", "Finding matched articles.", True)
847862
article_data = self.find_matched_articles(articles)
@@ -859,7 +874,7 @@ def process_articles(self, articles, total_file_size):
859874

860875
required_space = curation_folder_size + self.total_all_articles_file_size
861876

862-
self.logs.write_log_in_file("info", f"Total size of aritcles to be processed: {self.total_all_articles_file_size} bytes", True)
877+
self.logs.write_log_in_file("info", f"Total size of articles to be processed: {self.total_all_articles_file_size} bytes", True)
863878
self.logs.write_log_in_file("info", f"Total size of the curated folders for the matched articles: {curation_folder_size} bytes", True)
864879
self.logs.write_log_in_file("info", f"Total space required: {required_space} bytes", True)
865880

@@ -905,19 +920,22 @@ def process_articles(self, articles, total_file_size):
905920
self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - "
906921
+ "Post-processing script error found.", True)
907922
break
923+
908924
# end check main folder exists in preservation storage.
909925
# check required files exist in curation UAL_RDM folder
910926
self.logs.write_log_in_file("info", "Checking required files exist in associated curation "
911927
+ f"folder {curation_storage_location}.", True)
912928
copy_files = self.__can_copy_files(version_data)
913-
self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process)
929+
if self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process):
930+
processed_count += 1
914931
else:
915932
self.logs.write_log_in_file("error", "Pre-processing script failed. Running post-processing script.", True)
916933
# call post process script function for each matched item.
917934
value_post_process = self.processor.post_process_script_function("Article", check_dir, value_pre_process)
918935
if (value_post_process != 0):
919936
self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - "
920937
+ "Post-processing script failed.", True)
938+
return processed_count
921939

922940
"""
923941
Preservation and Curation directory access check while processing.

figshare/Collection.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,11 @@ def get_article_api_url(self, collection):
235235
return coll_articles_api
236236

237237
"""
238-
Function to process collections and its articles with collection versions.
238+
Function to process collections and its articles with collection versions. Returns the number of successfully processed collections.
239239
:param collections object
240240
"""
241241
def process_collections(self, collections):
242+
processed_count = 0
242243
self.logs.write_log_in_file("info", "Processing collections.", True)
243244
for collection in collections:
244245
data = collections[collection]
@@ -251,12 +252,15 @@ def process_collections(self, collections):
251252
author_name = re.sub("[^A-Za-z0-9]", "_", version['authors'][0]['full_name'])
252253
folder_name = str(collection) + "_" + version_no + "_" + author_name + "_" + version_md5 + "/" + version_no + "/METADATA"
253254
version["articles"] = articles
254-
self.logs.write_log_in_file("info", f"Processing collection {collection} version {version['version']}.", True)
255+
self.logs.write_log_in_file("info", f"------- Processing collection {collection} version {version['version']}.", True)
255256
self.__save_json_in_metadata(collection, version, folder_name)
256257
collection_preservation_path = self.preservation_storage_location + os.path.basename(os.path.dirname(os.path.dirname(folder_name)))
257258
value_post_process = self.processor.post_process_script_function("Collection", collection_preservation_path)
258259
if (value_post_process != 0):
259260
self.logs.write_log_in_file("error", f"collection {collection} - post-processing script failed.", True)
261+
else:
262+
processed_count += 1
263+
return processed_count
260264

261265
"""
262266
Save json data for each collection version in related directory

figshare/Integration.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,17 @@ def __init__(self, config: Config, log: Log):
1919
"""
2020
self._config = config
2121
self._rebachlogger = log
22+
self.duplicate_bag_in_preservation_storage_count = 0
2223

2324
"""
2425
Post-processing script command function.
2526
"""
2627
def post_process_script_function(self, *args):
2728
"""
28-
Execute a post-processing script on an article or collection package and return the result.
29-
3029
If the 'post_process_script_command' value in the configuration file is set to 'Bagger', this function
31-
will execute the post-processing script internally. Otherwise, it will expect the value to be a path
32-
to an external script, which will be called.
30+
will execute. Returns the return code of bagger.
31+
32+
Otherwise, the code will expect the value to be a path to an external script, which will be called *TODO*.
3333
3434
:param args: Variable-length arguments passed to the function.
3535
args[0]: 'Article' or 'Collection' to indicate whether the function is called from an Article or Collection.
@@ -90,13 +90,13 @@ def post_process_script_function(self, *args):
9090
self._rebachlogger.write_log_in_file("info", f"Exit code: {status}.", True)
9191
if (status == 0):
9292
self._rebachlogger.write_log_in_file("info", f"Preservation package '{preservation_package_name}' processed successfully", True)
93-
return 0
9493
elif (status == 3):
94+
# code 3 is special since we don't want to cause the calling code to interpret duplicates as an error since it will happen a lot
9595
self._rebachlogger.write_log_in_file("warning", f"'{preservation_package_name}' already exists in "
9696
+ f"{config['Wasabi']['host']}/{config['Wasabi']['bucket']}. File not uploaded.", True)
97-
return 0
98-
else:
99-
return status
97+
self.duplicate_bag_in_preservation_storage_count += 1
98+
status = 0
99+
return status
100100
else:
101101
self._rebachlogger.write_log_in_file("info",
102102
f"[not implemented] Executing post-processing script Command: {post_process_script_command}.", True)

0 commit comments

Comments
 (0)