Skip to content

Commit 0f93e22

Browse files
committed
Minor improvements for Google scholar fetcher
Detect the Google captcha div. Follow-up PRs might show or give more information to the user.
1 parent 8f68de9 commit 0f93e22

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java

+17 -6
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@
3535

3636
/**
3737
* FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar.
38-
*
38+
* <p>
3939
* Search String infos: https://scholar.google.com/intl/en/scholar/help.html#searching
4040
*/
4141
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {
@@ -58,11 +58,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
5858
@Override
5959
public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
6060
Objects.requireNonNull(entry);
61-
Optional<URL> pdfLink = Optional.empty();
6261

6362
// Search in title
6463
if (!entry.hasField(StandardField.TITLE)) {
65-
return pdfLink;
64+
return Optional.empty();
6665
}
6766

6867
try {
@@ -74,12 +73,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc
7473
// as_occt field to search in
7574
uriBuilder.addParameter("as_occt", "title");
7675

77-
pdfLink = search(uriBuilder.toString());
76+
return search(uriBuilder.toString());
7877
} catch (URISyntaxException e) {
7978
throw new FetcherException("Building URI failed.", e);
8079
}
81-
82-
return pdfLink;
8380
}
8481

8582
@Override
@@ -91,6 +88,11 @@ private Optional<URL> search(String url) throws IOException {
9188
Optional<URL> pdfLink = Optional.empty();
9289

9390
Document doc = Jsoup.connect(url).userAgent(URLDownload.USER_AGENT).get();
91+
92+
if (needsCaptcha(doc.body().html())) {
93+
LOGGER.warn("Hit Google traffic limitation. Captcha prevents automatic fetching.");
94+
return Optional.empty();
95+
}
9496
// Check results for PDF link
9597
// TODO: link always on first result or none?
9698
for (int i = 0; i < NUM_RESULTS; i++) {
@@ -111,6 +113,10 @@ private Optional<URL> search(String url) throws IOException {
111113
return pdfLink;
112114
}
113115

116+
private boolean needsCaptcha(String body) {
117+
return body.contains("id=\"gs_captcha_ccl\"");
118+
}
119+
114120
@Override
115121
public String getName() {
116122
return "Google Scholar";
@@ -158,6 +164,11 @@ public List<BibEntry> performSearch(String query) throws FetcherException {
158164
private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException {
159165
String content = new URLDownload(queryURL).asString();
160166

167+
if (needsCaptcha(content)) {
168+
throw new FetcherException("Fetching from Google Scholar failed.",
169+
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
170+
}
171+
161172
Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
162173
while (matcher.find()) {
163174
String citationsPageURL = matcher.group().replace("&amp;", "&");

0 commit comments

Comments
 (0)