Skip to content

Commit 6df693f

Browse files
committed
Add embeddings search to example app, closes #6
1 parent 1b51641 commit 6df693f

File tree

2 files changed

+47
-7
lines changed

2 files changed

+47
-7
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,6 @@ Python 3.9+ is supported. Using a Python [virtual environment](https://docs.pyth
5555
pip install git+https://github.com/neuml/annotateai
5656
```
5757

58-
Python 3.9+ is supported
59-
6058
## Examples
6159

6260
`annotateai` can annotate any PDF but it works especially well for medical and scientific papers. The following shows a series of examples using papers from [arXiv](https://arxiv.org/).

app/app.py

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@
88
import os
99
import platform
1010

11+
from urllib.parse import urlparse
12+
1113
import streamlit as st
1214

1315
from streamlit_pdf_viewer import pdf_viewer
1416

17+
from txtai import Embeddings
18+
1519
from annotateai import Annotate
1620

1721

@@ -36,6 +40,9 @@ def __init__(self):
3640
)
3741
)
3842

43+
# Embeddings database for search (lazy loaded)
44+
self.embeddings = None
45+
3946
def run(self):
4047
"""
4148
Main rendering logic.
@@ -56,12 +63,16 @@ def run(self):
5663
selected = st.session_state.get("selected")
5764

5865
# Create URL input using selected example, if applicable
59-
url = url.text_input("**URL or Local File Path**", value=examples.get(selected, ""))
66+
url = url.text_input("**URL / Local File Path / Search**", value=examples.get(selected, ""))
6067

6168
# Annotate the URL
6269
if url:
70+
# Check if URL is valid, otherwise run an embeddings search
71+
url = self.validate(url)
72+
6373
# Build the annotation file for URL
6474
with st.spinner(f"Generating annotations for {url}"):
75+
# Get the annotated output
6576
output = self.build(url)
6677

6778
# Get url file name
@@ -88,6 +99,35 @@ def onchange(self):
8899

89100
st.session_state.selected = st.session_state.example
90101
st.session_state.example = None
102+
st.session_state.url = None
103+
104+
def validate(self, url):
    """
    Checks if input is a url or local file path. Otherwise, this runs a search and returns
    the url for the top result.

    Args:
        url: input url, local file path or search query

    Returns:
        the input unchanged when it is an http(s) url or an existing local path, otherwise
        an arXiv PDF url built from the id of the top search result

    Raises:
        IndexError: if the search returns no results
    """

    # Check if this is a URL or local file path
    if urlparse(url).scheme in ("http", "https") or os.path.exists(url):
        return url

    # Lazy load of txtai-arxiv embeddings database
    # Compare against None so the check does not depend on the truthiness of the loaded index
    if self.embeddings is None:
        with st.spinner("Loading txtai-arxiv embeddings index for search"):
            self.embeddings = Embeddings().load(provider="huggingface-hub", container="neuml/txtai-arxiv")

    # Get top matching article, failing with a descriptive message when nothing matches
    results = self.embeddings.search(url, 1)
    if not results:
        raise IndexError(f"No search results found for {url}")

    result = results[0]

    # Title is the first line of the result text (a single line, so no newline cleanup is needed)
    title = result["text"].split("\n", 1)[0]

    st.toast(f"Ran search for {url} and using top match `{title}`")
    return f"https://arxiv.org/pdf/{result['id']}"
91131

92132
# pylint: disable=E0213
93133
@st.cache_data(show_spinner=False)
@@ -131,10 +171,12 @@ def create():
131171

132172
st.markdown(
133173
"""
134-
This application automatically annotates a paper using LLMs.
135-
136-
_Try PDFs from [arXiv](https://arxiv.org/), [PubMed](https://pubmed.ncbi.nlm.nih.gov/),
137-
[bioRxiv](https://www.biorxiv.org/) or [medRxiv](https://www.medrxiv.org/)!_
174+
This application automatically annotates papers using LLMs.
175+
176+
`Annotate URLs or local file paths, if found. Otherwise, the top result from the txtai-arxiv embeddings database is returned for the input.`
177+
178+
Try PDFs from [arXiv](https://arxiv.org/), [PubMed](https://pubmed.ncbi.nlm.nih.gov/),
179+
[bioRxiv](https://www.biorxiv.org/) or [medRxiv](https://www.medrxiv.org/)!
138180
"""
139181
)
140182

0 commit comments

Comments
 (0)