Skip to content

Commit 60ca35f

Browse files
authored
Merge pull request #371 from VikParuchuri/dev
Dev
2 parents 612722a + 035aea4 commit 60ca35f

File tree

6 files changed

+48
-16
lines changed

6 files changed

+48
-16
lines changed

CITATION.cff

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
cff-version: 1.2.0
2+
message: "If you use this software, please cite it using the following metadata."
3+
title: "Surya: A lightweight framework for analyzing documents and PDFs at scale"
4+
authors:
5+
- family-names: Paruchuri
6+
given-names: Vikas
7+
- name: Datalab Team
8+
date-released: 2025-05-13
9+
url: https://github.com/VikParuchuri/surya
10+
version: 0.14.0
11+
repository-code: https://github.com/VikParuchuri/surya

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,3 +560,16 @@ This work would not have been possible without amazing open source AI work:
560560
- [CRAFT](https://github.com/clovaai/CRAFT-pytorch), a great scene text detection model
561561

562562
Thank you to everyone who makes open source AI possible.
563+
564+
# Citation
565+
566+
If you use surya (or the associated models) in your work or research, please consider citing us using the following BibTeX entry:
567+
568+
```bibtex
569+
@misc{paruchuri2025surya,
570+
author = {Vikas Paruchuri and Datalab Team},
571+
title = {Surya: A lightweight document OCR and analysis toolkit},
572+
year = {2025},
573+
howpublished = {\url{https://github.com/VikParuchuri/surya}},
574+
note = {GitHub repository},
575+
}
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "surya-ocr"
3-
version = "0.14.0"
3+
version = "0.14.1"
44
description = "OCR, layout, reading order, and table recognition in 90+ languages"
55
authors = ["Vik Paruchuri <[email protected]>"]
66
readme = "README.md"

surya/scripts/hf_to_s3.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,56 +7,64 @@
77
from huggingface_hub import snapshot_download
88

99
import click
10+
from tqdm import tqdm
1011

1112
S3_API_URL = "https://1afbe4656a6b40d982ab5e730a39f6b9.r2.cloudflarestorage.com"
1213

14+
1315
@click.command(help="Uploads the data from huggingface to an S3 bucket")
1416
@click.argument("hf_repo_id", type=str)
1517
@click.argument("s3_path", type=str)
1618
@click.option("--bucket_name", type=str, default="datalab")
1719
@click.option("--access_key_id", type=str, default="<access_key_id>")
1820
@click.option("--access_key_secret", type=str, default="<access_key_secret>")
19-
def main(hf_repo_id: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str):
21+
@click.option("--suffix", type=str, default="")
22+
def main(
23+
hf_repo_id: str,
24+
s3_path: str,
25+
bucket_name: str,
26+
access_key_id: str,
27+
access_key_secret: str,
28+
suffix: str,
29+
):
2030
curr_date = datetime.datetime.now().strftime("%Y_%m_%d")
2131
s3_path = f"{s3_path}/{curr_date}"
32+
if suffix:
33+
s3_path = f"{s3_path}_{suffix}"
2234

2335
download_folder = snapshot_download(repo_id=hf_repo_id)
2436
download_folder = Path(download_folder)
2537
contained_files = list(download_folder.glob("*"))
26-
contained_files = [f.name for f in contained_files] # Just get the base name
38+
contained_files = [f.name for f in contained_files] # Just get the base name
2739
manifest_file = download_folder / "manifest.json"
2840

2941
with open(manifest_file, "w") as f:
3042
json.dump({"files": contained_files}, f)
3143

3244
# Upload the files to S3
3345
s3_client = boto3.client(
34-
's3',
46+
service_name="s3",
3547
endpoint_url=S3_API_URL,
3648
aws_access_key_id=access_key_id,
3749
aws_secret_access_key=access_key_secret,
38-
region_name="enam"
50+
region_name="auto",
3951
)
4052

4153
# Iterate through all files in the folder
42-
for file_path in download_folder.glob('*'):
54+
for file_path in tqdm(
55+
download_folder.glob("*"), desc="Uploading files", unit="file"
56+
):
4357
s3_key = f"{s3_path}/{file_path.name}"
4458

4559
try:
46-
s3_client.upload_file(
47-
str(file_path),
48-
bucket_name,
49-
s3_key
50-
)
60+
s3_client.upload_file(str(file_path), bucket_name, s3_key)
5161
except Exception as e:
5262
print(f"Error uploading {file_path}: {str(e)}")
5363

5464
shutil.rmtree(download_folder)
5565

5666
print(f"Uploaded files to {s3_path}")
5767

68+
5869
if __name__ == "__main__":
5970
main()
60-
61-
62-

surya/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
7575
COMPILE_DETECTOR: bool = False
7676

7777
# Text recognition
78-
RECOGNITION_MODEL_CHECKPOINT: str = "s3://text_recognition/2025_05_13"
78+
RECOGNITION_MODEL_CHECKPOINT: str = "s3://text_recognition/2025_05_16"
7979
RECOGNITION_MODEL_QUANTIZE: bool = False
8080
RECOGNITION_MAX_TOKENS: Optional[int] = None
8181
RECOGNITION_BATCH_SIZE: Optional[int] = (

tests/test_recognition.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ def test_recognition(recognition_predictor, detection_predictor, test_image):
66

77
text_lines = recognition_results[0].text_lines
88
assert len(text_lines) == 4
9-
assert text_lines[0].text == "Hello World"
9+
assert "Hello World" in text_lines[0].text

0 commit comments

Comments
 (0)