8
8
import os
9
9
import platform
10
10
11
+ from urllib .parse import urlparse
12
+
11
13
import streamlit as st
12
14
13
15
from streamlit_pdf_viewer import pdf_viewer
14
16
17
+ from txtai import Embeddings
18
+
15
19
from annotateai import Annotate
16
20
17
21
@@ -36,6 +40,9 @@ def __init__(self):
36
40
)
37
41
)
38
42
43
+ # Embeddings database for search (lazy loaded)
44
+ self .embeddings = None
45
+
39
46
def run (self ):
40
47
"""
41
48
Main rendering logic.
@@ -56,12 +63,16 @@ def run(self):
56
63
selected = st .session_state .get ("selected" )
57
64
58
65
# Create URL input using selected example, if applicable
59
- url = url .text_input ("**URL or Local File Path**" , value = examples .get (selected , "" ))
66
+ url = url .text_input ("**URL / Local File Path / Search **" , value = examples .get (selected , "" ))
60
67
61
68
# Annotate the URL
62
69
if url :
70
+ # Check if URL is valid, otherwise run an embeddings search
71
+ url = self .validate (url )
72
+
63
73
# Build the annotation file for URL
64
74
with st .spinner (f"Generating annotations for { url } " ):
75
+ # Get the annotated output
65
76
output = self .build (url )
66
77
67
78
# Get url file name
@@ -88,6 +99,35 @@ def onchange(self):
88
99
89
100
st .session_state .selected = st .session_state .example
90
101
st .session_state .example = None
102
+ st .session_state .url = None
103
+
104
def validate(self, url):
    """
    Checks if input is a url or local file path. Otherwise, this runs a search against the
    txtai-arxiv embeddings database and returns the arXiv PDF url for the top result.

    The embeddings database is loaded on first use and kept on the instance, so only the
    first search pays the load cost.

    Args:
        url: input url, local file path or search query

    Returns:
        url - the input unchanged when it is an http(s) url or an existing local path,
        otherwise the arXiv PDF url of the top search result

    Raises:
        ValueError: if the search returns no results for the input
    """

    # Check if this is a URL or local file path
    if urlparse(url).scheme in ("http", "https") or os.path.exists(url):
        return url

    # Lazy load of txtai-arxiv embeddings database. Compare against the None sentinel
    # explicitly so the check does not depend on the truthiness of the loaded index.
    if self.embeddings is None:
        with st.spinner("Loading txtai-arxiv embeddings index for search"):
            self.embeddings = Embeddings().load(provider="huggingface-hub", container="neuml/txtai-arxiv")

    # Get top matching article, fail with a clear message when nothing matches
    results = self.embeddings.search(url, 1)
    if not results:
        raise ValueError(f"No search results found for '{url}'")

    result = results[0]

    # First line of the result text is shown as the title (assumes title-first text layout)
    title = result["text"].partition("\n")[0]

    st.toast(f"Ran search for {url} and using top match `{title}`")

    # No surrounding whitespace - this value is passed on as the url to annotate
    return f"https://arxiv.org/pdf/{result['id']}"
91
131
92
132
# pylint: disable=E0213
93
133
@st .cache_data (show_spinner = False )
@@ -131,10 +171,12 @@ def create():
131
171
132
172
st .markdown (
133
173
"""
134
- This application automatically annotates a paper using LLMs.
135
-
136
- _Try PDFs from [arXiv](https://arxiv.org/), [PubMed](https://pubmed.ncbi.nlm.nih.gov/),
137
- [bioRxiv](https://www.biorxiv.org/) or [medRxiv](https://www.medrxiv.org/)!_
174
+ This application automatically annotates papers using LLMs.
175
+
176
+ `Annotate URLs or local file paths, if found. Otherwise, the top result from the txtai-arxiv embeddings database is returned for the input.`
177
+
178
+ Try PDFs from [arXiv](https://arxiv.org/), [PubMed](https://pubmed.ncbi.nlm.nih.gov/),
179
+ [bioRxiv](https://www.biorxiv.org/) or [medRxiv](https://www.medrxiv.org/)!
138
180
"""
139
181
)
140
182
0 commit comments