Skip to content

Upload functionality #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions amcat4apiclient/amcat4apiclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,14 @@ def query(self, index: str, q: Optional[str]= None, *,
params['scroll_id'] = d['meta']['scroll_id']


def upload(self, index: str, documents: list):
    """
    Send a batch of documents to an index on the server.

    The documents are serialised as the JSON body of a single POST request,
    authenticated with this client's username and password.

    :param index: The name of the index
    :param documents: A list of dictionaries with at least the keys date, title, text
    :return: response of the POST request to the server
    """
    endpoint = f"{self.host}/index/{index}/documents"
    credentials = (self.username, self.password)
    return requests.post(endpoint, auth=credentials, json=documents)
74 changes: 74 additions & 0 deletions demo-batchuploader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
'''
Uploads multiple (gzipped or not) JSON files to AmCAT
'''


import json
import gzip
import argparse
import os
from glob import glob
from tqdm import tqdm
from amcat4apiclient.amcat4apiclient import AmcatClient



def _chunker(iterable, chunksize=100):
'''Yield successive chunks from an iterable (e.g., list, generator)'''
chunk = []
for item in iterable:
if len(chunk) >= chunksize:
yield chunk
chunk = [item]
else:
chunk.append(item)
if chunk:
yield chunk


def _cleandoc(doc: dict):
'''Ensure that document conforms to AmCAT requirements'''
# rename 'publication_date' to 'date'; handle missing dates
doc['date'] = doc.pop('publication_date','1900-01-01')
# handle missing text
if 'text' not in doc: doc['text']=''
return doc


def read_file(fn, jsonlines=True):
    """Yield one parsed JSON document per line of a JSON-lines file.

    :param fn: path of the file; a name ending in ``.gz`` (in any case) is
        read through ``gzip``, anything else is read as a plain file
    :param jsonlines: must be true; plain JSON input is not supported yet
    :return: generator of the objects decoded from each non-blank line
    :raises NotImplementedError: if ``jsonlines`` is false (raised when the
        generator is first iterated, because this is a generator function)
    """
    if not jsonlines:
        raise NotImplementedError("Still need to import logic to support both JSON and JSON-lines")

    # Pick the opener once instead of duplicating the read loop per format.
    opener = gzip.open if fn.lower().endswith('.gz') else open
    # Binary mode on purpose: json.loads accepts bytes directly.
    with opener(fn, "rb") as f:
        for line in f:
            # A blank line (e.g. stray newline at the end) is not valid JSON
            # and would abort the whole file, so skip it.
            if not line.strip():
                continue
            yield json.loads(line)


if __name__ == "__main__":
    # Command-line entry point: upload every file matching a glob pattern
    # to one AmCAT index, in chunks of documents.
    parser = argparse.ArgumentParser(description=__doc__,
                                     epilog="Set the environment variables AMCATUSER and "
                                            "AMCATPASSWORD to use non-default credentials")
    parser.add_argument('index',
                        help="The name of the index ('project') to upload to")
    # nargs='?' is what makes the default effective: argparse ignores
    # `default` on a positional argument that is required.
    parser.add_argument('url', nargs='?', default='http://127.0.0.1:5000',
                        help='The address of the AmCAT server')
    parser.add_argument('files', help='Glob pattern of json(.gz) files')

    args = parser.parse_args()
    user = os.environ.get("AMCATUSER", "admin")
    passwd = os.environ.get("AMCATPASSWORD", "admin")
    amcat = AmcatClient(args.url, user, passwd)
    allfiles = glob(args.files)
    for fn in tqdm(allfiles):
        print(f"Processing {fn}...")
        data = read_file(fn)
        for chunk in tqdm(_chunker(data)):
            cleanchunk = [_cleandoc(art) for art in chunk]
            # Upload to the index named on the command line rather than a
            # hard-coded one.
            r = amcat.upload(args.index, cleanchunk)
            print(r.status_code)