35
35
36
36
/**
37
37
* FulltextFetcher implementation that attempts to find a PDF URL on Google Scholar.
38
- *
38
+ * <p>
39
39
* Information on the search string syntax: https://scholar.google.com/intl/en/scholar/help.html#searching
40
40
*/
41
41
public class GoogleScholar implements FulltextFetcher , SearchBasedFetcher {
@@ -58,11 +58,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
58
58
@ Override
59
59
public Optional <URL > findFullText (BibEntry entry ) throws IOException , FetcherException {
60
60
Objects .requireNonNull (entry );
61
- Optional <URL > pdfLink = Optional .empty ();
62
61
63
62
// Search in title
64
63
if (!entry .hasField (StandardField .TITLE )) {
65
- return pdfLink ;
64
+ return Optional . empty () ;
66
65
}
67
66
68
67
try {
@@ -74,12 +73,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc
74
73
// as_occt field to search in
75
74
uriBuilder .addParameter ("as_occt" , "title" );
76
75
77
- pdfLink = search (uriBuilder .toString ());
76
+ return search (uriBuilder .toString ());
78
77
} catch (URISyntaxException e ) {
79
78
throw new FetcherException ("Building URI failed." , e );
80
79
}
81
-
82
- return pdfLink ;
83
80
}
84
81
85
82
@ Override
@@ -91,6 +88,11 @@ private Optional<URL> search(String url) throws IOException {
91
88
Optional <URL > pdfLink = Optional .empty ();
92
89
93
90
Document doc = Jsoup .connect (url ).userAgent (URLDownload .USER_AGENT ).get ();
91
+
92
+ if (needsCaptcha (doc .body ().html ())) {
93
+ LOGGER .warn ("Hit Google traffic limitation. Captcha prevents automatic fetching." );
94
+ return Optional .empty ();
95
+ }
94
96
// Check results for PDF link
95
97
// TODO: link always on first result or none?
96
98
for (int i = 0 ; i < NUM_RESULTS ; i ++) {
@@ -111,6 +113,10 @@ private Optional<URL> search(String url) throws IOException {
111
113
return pdfLink ;
112
114
}
113
115
116
+ private boolean needsCaptcha (String body ) {
117
+ return body .contains ("id=\" gs_captcha_ccl\" " );
118
+ }
119
+
114
120
@ Override
115
121
public String getName () {
116
122
return "Google Scholar" ;
@@ -158,6 +164,11 @@ public List<BibEntry> performSearch(String query) throws FetcherException {
158
164
private void addHitsFromQuery (List <BibEntry > entryList , String queryURL ) throws IOException , FetcherException {
159
165
String content = new URLDownload (queryURL ).asString ();
160
166
167
+ if (needsCaptcha (content )) {
168
+ throw new FetcherException ("Fetching from Google Scholar failed." ,
169
+ Localization .lang ("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)." ), null );
170
+ }
171
+
161
172
Matcher matcher = LINK_TO_BIB_PATTERN .matcher (content );
162
173
while (matcher .find ()) {
163
174
String citationsPageURL = matcher .group ().replace ("&" , "&" );
0 commit comments