Merge pull request #371 from VikParuchuri/dev

VikParuchuri · web-flow · commit 60ca35fe10fb · 2025-05-16T14:49:55.000-07:00
Dev
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it using the following metadata."
+title: "Surya: A lightweight framework for analyzing documents and PDFs at scale"
+authors:
+  - family-names: Paruchuri
+    given-names: Vikas
+  - name: Datalab Team
+date-released: 2025-05-13
+url: https://github.com/VikParuchuri/surya
+version: 0.14.0
+repository-code: https://github.com/VikParuchuri/surya
diff --git a/README.md b/README.md
@@ -560,3 +560,17 @@ This work would not have been possible without amazing open source AI work:
 - [CRAFT](https://github.com/clovaai/CRAFT-pytorch), a great scene text detection model
 
 Thank you to everyone who makes open source AI possible.
+
+# Citation
+
+If you use surya (or the associated models) in your work or research, please consider citing us using the following BibTeX entry:
+
+```bibtex
+@misc{paruchuri2025surya,
+  author       = {Vikas Paruchuri and Datalab Team},
+  title        = {Surya: A lightweight document OCR and analysis toolkit},
+  year         = {2025},
+  howpublished = {\url{https://github.com/VikParuchuri/surya}},
+  note         = {GitHub repository},
+}
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.14.0"
+version = "0.14.1"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"
diff --git a/surya/scripts/hf_to_s3.py b/surya/scripts/hf_to_s3.py
@@ -7,56 +7,64 @@
 from huggingface_hub import snapshot_download
 
 import click
+from tqdm import tqdm
 
 S3_API_URL = "https://1afbe4656a6b40d982ab5e730a39f6b9.r2.cloudflarestorage.com"
 
+
 @click.command(help="Uploads the data from huggingface to an S3 bucket")
 @click.argument("hf_repo_id", type=str)
 @click.argument("s3_path", type=str)
 @click.option("--bucket_name", type=str, default="datalab")
 @click.option("--access_key_id", type=str, default="<access_key_id>")
 @click.option("--access_key_secret", type=str, default="<access_key_secret>")
-def main(hf_repo_id: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str):
+@click.option("--suffix", type=str, default="")
+def main(
+    hf_repo_id: str,
+    s3_path: str,
+    bucket_name: str,
+    access_key_id: str,
+    access_key_secret: str,
+    suffix: str,
+):
     curr_date = datetime.datetime.now().strftime("%Y_%m_%d")
     s3_path = f"{s3_path}/{curr_date}"
+    if suffix:
+        s3_path = f"{s3_path}_{suffix}"
 
     download_folder = snapshot_download(repo_id=hf_repo_id)
     download_folder = Path(download_folder)
     contained_files = list(download_folder.glob("*"))
-    contained_files = [f.name for f in contained_files] # Just get the base name
+    contained_files = [f.name for f in contained_files]  # Just get the base name
     manifest_file = download_folder / "manifest.json"
 
     with open(manifest_file, "w") as f:
         json.dump({"files": contained_files}, f)
 
     # Upload the files to S3
     s3_client = boto3.client(
-        's3',
+        service_name="s3",
         endpoint_url=S3_API_URL,
         aws_access_key_id=access_key_id,
         aws_secret_access_key=access_key_secret,
-        region_name="enam"
+        region_name="auto",
     )
 
     # Iterate through all files in the folder
-    for file_path in download_folder.glob('*'):
+    for file_path in tqdm(
+        download_folder.glob("*"), desc="Uploading files", unit="file"
+    ):
         s3_key = f"{s3_path}/{file_path.name}"
 
         try:
-            s3_client.upload_file(
-                str(file_path),
-                bucket_name,
-                s3_key
-            )
+            s3_client.upload_file(str(file_path), bucket_name, s3_key)
         except Exception as e:
             print(f"Error uploading {file_path}: {str(e)}")
 
     shutil.rmtree(download_folder)
 
     print(f"Uploaded files to {s3_path}")
 
+
 if __name__ == "__main__":
     main()
-
-
-
diff --git a/surya/settings.py b/surya/settings.py
@@ -75,7 +75,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
     COMPILE_DETECTOR: bool = False
 
     # Text recognition
-    RECOGNITION_MODEL_CHECKPOINT: str = "s3://text_recognition/2025_05_13"
+    RECOGNITION_MODEL_CHECKPOINT: str = "s3://text_recognition/2025_05_16"
     RECOGNITION_MODEL_QUANTIZE: bool = False
     RECOGNITION_MAX_TOKENS: Optional[int] = None
     RECOGNITION_BATCH_SIZE: Optional[int] = (
diff --git a/tests/test_recognition.py b/tests/test_recognition.py
@@ -6,4 +6,4 @@ def test_recognition(recognition_predictor, detection_predictor, test_image):
 
     text_lines = recognition_results[0].text_lines
     assert len(text_lines) == 4
-    assert text_lines[0].text == "Hello World"
+    assert "Hello World" in text_lines[0].text