Skip to content

Community Batch Imports endpoint #8122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions openlibrary/core/community_batch_imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from dataclasses import dataclass
import json
import hashlib

from json.decoder import JSONDecodeError

from pydantic import ValidationError
from pydantic_core import ErrorDetails
from openlibrary import accounts
from openlibrary.core.imports import Batch
from openlibrary.plugins.importapi.import_edition_builder import import_edition_builder
from openlibrary.catalog.add_book import (
IndependentlyPublished,
PublicationYearTooOld,
PublishedInFutureYear,
SourceNeedsISBN,
validate_record,
)


def generate_hash(data: bytes, length: int = 20) -> str:
    """
    Generate a SHA-256 hash of `data` and truncate its hex digest to `length`.

    This is used to help uniquely identify a community_batch_import: naming a
    batch after a hash of its contents means re-uploading the same file maps
    to the same batch. Keeping 20 hex characters retains 80 bits of the
    digest, so by the birthday bound a collision is not expected until on the
    order of 2**40 (~10**12) distinct uploads.

    :param data: raw bytes to hash (e.g. the uploaded JSONL file contents).
    :param length: number of hex characters of the digest to keep.
    :return: the first `length` characters of the SHA-256 hex digest.
    """
    # Avoid shadowing the builtin `hash`.
    digest = hashlib.sha256(data).hexdigest()
    return digest[:length]


@dataclass
class CbiErrorItem:
    """
    A single validation failure from a community batch import upload.

    While the uploaded JSONL is parsed, each offending line is captured as one
    of these items — the 1-based line number paired with the message of the
    error that was raised — so the uploading patron can see exactly which
    lines to fix.
    """

    line_number: int
    error_message: str


@dataclass
class CbiResult:
    """
    Outcome of community_batch_import().

    Exactly one of the two fields is expected to be populated: `batch` holds
    the queued Batch on success, while `errors` carries the per-line
    validation failures when the upload was rejected.
    """

    batch: Batch | None = None
    errors: list[CbiErrorItem] | None = None


def format_pydantic_errors(
    errors: list[ErrorDetails], line_idx: int
) -> list[CbiErrorItem]:
    """
    Turn Pydantic error details into CbiErrorItem entries, all tagged with
    the given line number, for easier eventual HTML representation.
    """

    def describe(error: ErrorDetails) -> str:
        # `loc` may be absent or empty; join its parts into a readable path.
        loc = error.get("loc")
        loc_str = ", ".join(map(str, loc)) if loc else ""
        return f"{loc_str} field: {error['msg']}. (Type: {error['type']})"

    return [
        CbiErrorItem(line_number=line_idx, error_message=describe(error))
        for error in errors
    ]


def community_batch_import(raw_data: bytes) -> CbiResult:
    """
    This processes raw byte data from a JSONL POST to the community batch
    import endpoint.

    Each line in the JSONL file (i.e. import item) must pass the same
    validation as required for a valid import going through load().

    The return value has `batch` and `errors` attributes for the caller to use
    to access the batch and any errors, respectively. See CbiResult for more.

    The line numbers errors use 1-based counting because they are line numbers
    in a file.

    :param raw_data: the raw bytes of the uploaded JSONL file.
    :return: a CbiResult with either the queued batch or the per-line errors.
    """
    user = accounts.get_current_user()
    username = user.get_username()
    # Name the batch after a content hash so re-uploading the same file
    # resolves to the same batch rather than creating a duplicate.
    batch_name = f"cb-{generate_hash(raw_data)}"
    errors: list[CbiErrorItem] = []
    dict_data = []

    for index, line in enumerate(raw_data.decode("utf-8").splitlines()):
        # Allow comments with `#` in these import records, even though they're
        # JSONL, and skip blank lines rather than reporting them as JSON errors.
        if not line.strip() or line.startswith("#"):
            continue

        try:
            raw_record = json.loads(line)
            # Validate the JSON + convert to import rec format and use
            # validation from load().
            edition_builder = import_edition_builder(init_dict=raw_record)
            validate_record(edition_builder.get_dict())

            # Add the raw_record for later processing; it still must go
            # through load() independently.
            dict_data.append(raw_record)
        except (
            JSONDecodeError,
            PublicationYearTooOld,
            PublishedInFutureYear,
            IndependentlyPublished,
            SourceNeedsISBN,
        ) as e:
            # +1 because patrons see 1-based line numbers in their editor.
            errors.append(CbiErrorItem(line_number=index + 1, error_message=str(e)))

        except ValidationError as e:
            errors.extend(format_pydantic_errors(e.errors(), index + 1))

    # No import is queued unless *every* record validates successfully.
    if errors:
        return CbiResult(errors=errors)

    # Format data for queueing via batch import.
    batch_data = [
        {
            'ia_id': item['source_records'][0],
            'data': item,
            'submitter': username,
        }
        for item in dict_data
    ]

    # Create the batch.
    # TODO(review): batch_name is effectively treated as a unique key here;
    # in the future the batch id, not its name, should be the identifier.
    batch = Batch.find(batch_name) or Batch.new(name=batch_name, submitter=username)
    assert batch is not None
    batch.add_items(batch_data)

    return CbiResult(batch=batch)
34 changes: 27 additions & 7 deletions openlibrary/core/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from psycopg2.errors import UndefinedTable, UniqueViolation
from pydantic import ValidationError
from web.db import ResultSet
from web.utils import Storage

from . import db

Expand All @@ -31,17 +30,28 @@


class Batch(web.storage):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Todo in a future PR: make this not extend web.storage


def __init__(self, mapping, *requireds, **defaults):
    """
    Set up per-batch import statistics while deferring to web.storage's own
    __init__ for the underlying mapping behaviour.
    """
    super().__init__(mapping, *requireds, **defaults)
    # Running statistics, updated as items are deduped/added to the batch.
    self.items_not_added: set = set()
    self.total_added: int = 0
    self.total_not_added: int = 0
    self.total_tried: int = 0

@staticmethod
def find(name, create=False):
def find(name: str, create: bool = False) -> "Batch": # type: ignore[return]
result = db.query("SELECT * FROM import_batch where name=$name", vars=locals())
if result:
return Batch(result[0])
elif create:
return Batch.new(name)

@staticmethod
def new(name):
db.insert("import_batch", name=name)
def new(name: str, submitter: str | None = None) -> "Batch":
db.insert("import_batch", name=name, submitter=submitter)
return Batch.find(name=name)

def load_items(self, filename):
Expand All @@ -64,6 +74,13 @@ def dedupe_items(self, items):
self.name,
len(already_present),
)

# Update batch counts
self.total_tried = len(ia_ids)
self.total_not_added = len(already_present)
self.total_added = self.total_tried - self.total_not_added
self.items_not_added = already_present

# Those unique items whose ia_id's aren't already present
return [item for item in items if item.get('ia_id') not in already_present]

Expand All @@ -82,20 +99,23 @@ def normalize_items(self, items):
if item.get('data')
else None
),
'submitter': (
item.get('submitter') if item.get('submitter') else None
),
}
)
for item in items
]

def add_items(self, items: list[str] | list[dict]) -> None:
def add_items(self, items: list[str] | list[dict]) -> str | None:
"""
:param items: either a list of `ia_id` (legacy) or a list of dicts
containing keys `ia_id` and book `data`. In the case of
the latter, `ia_id` will be of form e.g. "isbn:1234567890";
i.e. of a format id_type:value which cannot be a valid IA id.
"""
if not items:
return
return None

logger.info("batch %s: adding %d items", self.name, len(items))

Expand All @@ -113,7 +133,7 @@ def add_items(self, items: list[str] | list[dict]) -> None:

logger.info("batch %s: added %d items", self.name, len(items))

return
return f"batch {self.name}: added {len(items)}"

def get_items(self, status="pending"):
result = db.where("import_item", batch_id=self.id, status=status)
Expand Down
55 changes: 54 additions & 1 deletion openlibrary/i18n/messages.pot
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,59 @@ msgstr ""
msgid "Point your camera at a barcode! 📷"
msgstr ""

#: cbi.html
msgid "Community Batch Imports"
msgstr ""

#: cbi.html
msgid "Attach a JSONL formatted document with import records."
msgstr ""

#: cbi.html
msgid ""
"See the <a href=\"https://github.com/internetarchive/openlibrary-"
"client/tree/master/olclient/schemata\">olclient import schemata</a> for "
"more."
msgstr ""

#: cbi.html
msgid "Import failed."
msgstr ""

#: cbi.html
msgid "No import will be queued until *every* record validates successfully."
msgstr ""

#: cbi.html
msgid ""
"Please update the JSONL file and reload this page (there should be no "
"need to reattach the file)."
msgstr ""

#: cbi.html
msgid "Validation errors:"
msgstr ""

#: cbi.html
msgid "Import results for batch:"
msgstr ""

#: cbi.html
msgid "Total tried:"
msgstr ""

#: cbi.html
msgid "Total queued:"
msgstr ""

#: cbi.html
msgid "Total not queued:"
msgstr ""

#: cbi.html
msgid "Not queued because they are already present in the queue:"
msgstr ""

#: diff.html
#, python-format
msgid "Diff on %s"
Expand Down Expand Up @@ -4392,7 +4445,7 @@ msgstr ""
msgid "Recent Activity"
msgstr ""

#: lists/home.html
#: lists/home.html lists/showcase.html
msgid "See all"
msgstr ""

Expand Down
6 changes: 6 additions & 0 deletions openlibrary/plugins/importapi/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from openlibrary.catalog.get_ia import get_marc_record_from_ia, get_from_archive_bulk
from openlibrary import accounts, records
from openlibrary.core import ia
from openlibrary.core.community_batch_imports import community_batch_import
from openlibrary.plugins.upstream.utils import (
LanguageNoMatchError,
get_abbrev_from_full_lang_name,
Expand All @@ -38,6 +39,7 @@

import urllib


MARC_LENGTH_POS = 5
logger = logging.getLogger('openlibrary.importapi')

Expand Down Expand Up @@ -130,6 +132,10 @@ def POST(self):

data = web.data()

i = web.input()
if i.batch:
return community_batch_import(data)

try:
edition, _ = parse_data(data)

Expand Down
25 changes: 25 additions & 0 deletions openlibrary/plugins/openlibrary/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
import math
from pathlib import Path
import infogami
from openlibrary.core.community_batch_imports import (
CbiErrorItem,
community_batch_import,
)

# make sure infogami.config.features is set
if not hasattr(infogami.config, 'features'):
Expand Down Expand Up @@ -479,6 +483,27 @@ def remove_high_priority(query: str) -> str:
return new_query


class community_batch_imports(delegate.page):
    """
    Endpoint for community batch imports: a JSONL file is expected to arrive
    as a multipart/form-data POST.
    """

    path = '/import/batch/new'

    def GET(self):
        # Render the upload form with no result yet.
        return render_template("cbi.html", cbi_result=None)

    def POST(self):
        if not can_write():
            raise Forbidden('Permission Denied.')

        # Pull the uploaded file out of the multipart form; the `cbi` key
        # matches the <input> name in the template's <form>.
        upload = web.input(cbi={})
        raw_bytes = upload['cbi'].file.read()
        result = community_batch_import(raw_bytes)

        return render_template("cbi.html", cbi_result=result)


class isbn_lookup(delegate.page):
path = r'/(?:isbn|ISBN)/(.{10,})'

Expand Down
Loading
Loading