Skip to content

Community Batch Imports endpoint #8122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions openlibrary/core/community_batch_imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from dataclasses import dataclass
import json
import hashlib

from json.decoder import JSONDecodeError

from pydantic import ValidationError
from pydantic_core import ErrorDetails
from openlibrary import accounts
from openlibrary.core.imports import Batch
from openlibrary.plugins.importapi.import_edition_builder import import_edition_builder
from openlibrary.catalog.add_book import (
IndependentlyPublished,
PublicationYearTooOld,
PublishedInFutureYear,
SourceNeedsISBN,
validate_record,
)


def generate_hash(data: bytes, length: int = 20) -> str:
    """
    Generate a SHA-256 hash of `data` and truncate its hex digest to `length`.

    This is used to help uniquely identify a community_batch_import: naming a
    batch after a hash of its contents means re-uploading the same file maps
    to the same batch. Keeping 20 hex characters retains 80 bits of the
    digest, so by the birthday bound a collision is not expected until on the
    order of 2**40 (~10**12) distinct uploads.

    :param data: raw bytes to hash (e.g. the uploaded JSONL file contents).
    :param length: number of hex characters of the digest to keep.
    :return: the first `length` characters of the SHA-256 hex digest.
    """
    # Avoid shadowing the builtin `hash`.
    digest = hashlib.sha256(data).hexdigest()
    return digest[:length]


@dataclass
class CbiErrorItem:
    """
    A single validation failure from a community batch import upload.

    While the uploaded JSONL is parsed, each offending line is captured as one
    of these items — the 1-based line number paired with the message of the
    error that was raised — so the uploading patron can see exactly which
    lines to fix.
    """

    line_number: int
    error_message: str


@dataclass
class CbiResult:
    """
    Outcome of community_batch_import().

    Exactly one of the two fields is expected to be populated: `batch` holds
    the queued Batch on success, while `errors` carries the per-line
    validation failures when the upload was rejected.
    """

    batch: Batch | None = None
    errors: list[CbiErrorItem] | None = None


def format_pydantic_errors(
    errors: list[ErrorDetails], line_idx: int
) -> list[CbiErrorItem]:
    """
    Turn Pydantic error details into CbiErrorItem entries, all tagged with
    the given line number, for easier eventual HTML representation.
    """

    def describe(error: ErrorDetails) -> str:
        # `loc` may be absent or empty; join its parts into a readable path.
        loc = error.get("loc")
        loc_str = ", ".join(map(str, loc)) if loc else ""
        return f"{loc_str} field: {error['msg']}. (Type: {error['type']})"

    return [
        CbiErrorItem(line_number=line_idx, error_message=describe(error))
        for error in errors
    ]


def community_batch_import(raw_data: bytes) -> CbiResult:
    """
    This processes raw byte data from a JSONL POST to the community batch
    import endpoint.

    Each line in the JSONL file (i.e. import item) must pass the same
    validation as required for a valid import going through load().

    The return value has `batch` and `errors` attributes for the caller to use
    to access the batch and any errors, respectively. See CbiResult for more.

    The line numbers errors use 1-based counting because they are line numbers
    in a file.

    :param raw_data: the raw bytes of the uploaded JSONL file.
    :return: a CbiResult with either the queued batch or the per-line errors.
    """
    user = accounts.get_current_user()
    username = user.get_username()
    # Name the batch after a content hash so re-uploading the same file
    # resolves to the same batch rather than creating a duplicate.
    batch_name = f"cb-{generate_hash(raw_data)}"
    errors: list[CbiErrorItem] = []
    dict_data = []

    for index, line in enumerate(raw_data.decode("utf-8").splitlines()):
        # Allow comments with `#` in these import records, even though they're
        # JSONL, and skip blank lines rather than reporting them as JSON errors.
        if not line.strip() or line.startswith("#"):
            continue

        try:
            raw_record = json.loads(line)
            # Validate the JSON + convert to import rec format and use
            # validation from load().
            edition_builder = import_edition_builder(init_dict=raw_record)
            validate_record(edition_builder.get_dict())

            # Add the raw_record for later processing; it still must go
            # through load() independently.
            dict_data.append(raw_record)
        except (
            JSONDecodeError,
            PublicationYearTooOld,
            PublishedInFutureYear,
            IndependentlyPublished,
            SourceNeedsISBN,
        ) as e:
            # +1 because patrons see 1-based line numbers in their editor.
            errors.append(CbiErrorItem(line_number=index + 1, error_message=str(e)))

        except ValidationError as e:
            errors.extend(format_pydantic_errors(e.errors(), index + 1))

    # No import is queued unless *every* record validates successfully.
    if errors:
        return CbiResult(errors=errors)

    # Format data for queueing via batch import.
    batch_data = [
        {
            'ia_id': item['source_records'][0],
            'data': item,
            'submitter': username,
        }
        for item in dict_data
    ]

    # Create the batch.
    # TODO(review): batch_name is effectively treated as a unique key here;
    # in the future the batch id, not its name, should be the identifier.
    batch = Batch.find(batch_name) or Batch.new(name=batch_name, submitter=username)
    assert batch is not None
    batch.add_items(batch_data)

    return CbiResult(batch=batch)
34 changes: 27 additions & 7 deletions openlibrary/core/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from psycopg2.errors import UndefinedTable, UniqueViolation
from pydantic import ValidationError
from web.db import ResultSet
from web.utils import Storage

from . import db

Expand All @@ -31,17 +30,28 @@


class Batch(web.storage):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Todo in a future PR: make this not extend web.storage


def __init__(self, mapping, *requireds, **defaults):
    """
    Set up per-batch import statistics while deferring to web.storage's own
    __init__ for the underlying mapping behaviour.
    """
    super().__init__(mapping, *requireds, **defaults)
    # Running statistics, updated as items are deduped/added to the batch.
    self.items_not_added: set = set()
    self.total_added: int = 0
    self.total_not_added: int = 0
    self.total_tried: int = 0

@staticmethod
def find(name, create=False):
def find(name: str, create: bool = False) -> "Batch": # type: ignore[return]
result = db.query("SELECT * FROM import_batch where name=$name", vars=locals())
if result:
return Batch(result[0])
elif create:
return Batch.new(name)

@staticmethod
def new(name):
db.insert("import_batch", name=name)
def new(name: str, submitter: str | None = None) -> "Batch":
db.insert("import_batch", name=name, submitter=submitter)
return Batch.find(name=name)

def load_items(self, filename):
Expand All @@ -64,6 +74,13 @@ def dedupe_items(self, items):
self.name,
len(already_present),
)

# Update batch counts
self.total_tried = len(ia_ids)
self.total_not_added = len(already_present)
self.total_added = self.total_tried - self.total_not_added
self.items_not_added = already_present

# Those unique items whose ia_id's aren't already present
return [item for item in items if item.get('ia_id') not in already_present]

Expand All @@ -82,20 +99,23 @@ def normalize_items(self, items):
if item.get('data')
else None
),
'submitter': (
item.get('submitter') if item.get('submitter') else None
),
}
)
for item in items
]

def add_items(self, items: list[str] | list[dict]) -> None:
def add_items(self, items: list[str] | list[dict]) -> str | None:
"""
:param items: either a list of `ia_id` (legacy) or a list of dicts
containing keys `ia_id` and book `data`. In the case of
the latter, `ia_id` will be of form e.g. "isbn:1234567890";
i.e. of a format id_type:value which cannot be a valid IA id.
"""
if not items:
return
return None

logger.info("batch %s: adding %d items", self.name, len(items))

Expand All @@ -113,7 +133,7 @@ def add_items(self, items: list[str] | list[dict]) -> None:

logger.info("batch %s: added %d items", self.name, len(items))

return
return f"batch {self.name}: added {len(items)}"

def get_items(self, status="pending"):
result = db.where("import_item", batch_id=self.id, status=status)
Expand Down
55 changes: 54 additions & 1 deletion openlibrary/i18n/messages.pot
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,59 @@ msgstr ""
msgid "Point your camera at a barcode! 📷"
msgstr ""

#: cbi.html
msgid "Community Batch Imports"
msgstr ""

#: cbi.html
msgid "Attach a JSONL formatted document with import records."
msgstr ""

#: cbi.html
msgid ""
"See the <a href=\"https://github.com/internetarchive/openlibrary-"
"client/tree/master/olclient/schemata\">olclient import schemata</a> for "
"more."
msgstr ""

#: cbi.html
msgid "Import failed."
msgstr ""

#: cbi.html
msgid "No import will be queued until *every* record validates successfully."
msgstr ""

#: cbi.html
msgid ""
"Please update the JSONL file and reload this page (there should be no "
"need to reattach the file)."
msgstr ""

#: cbi.html
msgid "Validation errors:"
msgstr ""

#: cbi.html
msgid "Import results for batch:"
msgstr ""

#: cbi.html
msgid "Total tried:"
msgstr ""

#: cbi.html
msgid "Total queued:"
msgstr ""

#: cbi.html
msgid "Total not queued:"
msgstr ""

#: cbi.html
msgid "Not queued because they are already present in the queue:"
msgstr ""

#: diff.html
#, python-format
msgid "Diff on %s"
Expand Down Expand Up @@ -4392,7 +4445,7 @@ msgstr ""
msgid "Recent Activity"
msgstr ""

#: lists/home.html
#: lists/home.html lists/showcase.html
msgid "See all"
msgstr ""

Expand Down
6 changes: 6 additions & 0 deletions openlibrary/plugins/importapi/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from openlibrary.catalog.get_ia import get_marc_record_from_ia, get_from_archive_bulk
from openlibrary import accounts, records
from openlibrary.core import ia
from openlibrary.core.community_batch_imports import community_batch_import
from openlibrary.plugins.upstream.utils import (
LanguageNoMatchError,
get_abbrev_from_full_lang_name,
Expand All @@ -38,6 +39,7 @@

import urllib


MARC_LENGTH_POS = 5
logger = logging.getLogger('openlibrary.importapi')

Expand Down Expand Up @@ -130,6 +132,10 @@ def POST(self):

data = web.data()

i = web.input()
if i.batch:
return community_batch_import(data)

try:
edition, _ = parse_data(data)

Expand Down
25 changes: 25 additions & 0 deletions openlibrary/plugins/openlibrary/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
import math
from pathlib import Path
import infogami
from openlibrary.core.community_batch_imports import (
CbiErrorItem,
community_batch_import,
)

# make sure infogami.config.features is set
if not hasattr(infogami.config, 'features'):
Expand Down Expand Up @@ -479,6 +483,27 @@ def remove_high_priority(query: str) -> str:
return new_query


class community_batch_imports(delegate.page):
    """
    Endpoint for community batch imports: a JSONL file is expected to arrive
    as a multipart/form-data POST.
    """

    path = '/import/batch/new'

    def GET(self):
        # Render the upload form with no result yet.
        return render_template("cbi.html", cbi_result=None)

    def POST(self):
        if not can_write():
            raise Forbidden('Permission Denied.')

        # Pull the uploaded file out of the multipart form; the `cbi` key
        # matches the <input> name in the template's <form>.
        upload = web.input(cbi={})
        raw_bytes = upload['cbi'].file.read()
        result = community_batch_import(raw_bytes)

        return render_template("cbi.html", cbi_result=result)


class isbn_lookup(delegate.page):
path = r'/(?:isbn|ISBN)/(.{10,})'

Expand Down
Loading
Loading