diff --git a/.idea/runConfigurations/JabRef_Main.xml b/.idea/runConfigurations/JabRef_Main.xml deleted file mode 100644 index 731578b5c75..00000000000 --- a/.idea/runConfigurations/JabRef_Main.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - diff --git a/CHANGELOG.md b/CHANGELOG.md index f4dba622ad6..68b1af4651e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - We fixed an issue where a message about changed metadata would occur on saving although nothing changed. [#9159](https://github.com/JabRef/jabref/issues/9159) - When adding or editing a subgroup it is placed w.r.t. to alphabetical ordering rather than at the end. [koppor#577](https://github.com/koppor/jabref/issues/577) - We modified the Directory of Open Access Books (DOAB) fetcher so that it will now also fetch the ISBN when possible. [#8708](https://github.com/JabRef/jabref/issues/8708) +- ArXiv fetcher now retrieves additional data from related DOIs (both ArXiv and user-assigned). [#9170](https://github.com/JabRef/jabref/pull/9170) - Groups context menu now shows appropriate options depending on number of subgroups. [koppor#579](https://github.com/koppor/jabref/issues/579) - We changed the color of found text from red to high-contrast colors (background: yellow; font color: purple). [koppor#552](https://github.com/koppor/jabref/issues/552) - We modified the "Delete file" dialog and add the full file path to the dialog text. The file path in the title was changed to file name only. [koppor#534](https://github.com/koppor/jabref/issues/534) diff --git a/src/main/java/org/jabref/gui/externalfiles/ImportHandler.java b/src/main/java/org/jabref/gui/externalfiles/ImportHandler.java index 38fd37299af..c968bedff7c 100644 --- a/src/main/java/org/jabref/gui/externalfiles/ImportHandler.java +++ b/src/main/java/org/jabref/gui/externalfiles/ImportHandler.java @@ -29,7 +29,7 @@ import org.jabref.logic.importer.ImportFormatReader; import org.jabref.logic.importer.ImportFormatReader.UnknownFormatImport; import org.jabref.logic.importer.ParseException; -import org.jabref.logic.importer.fetcher.ArXiv; +import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.DoiFetcher; import org.jabref.logic.importer.fileformat.BibtexParser; import org.jabref.logic.l10n.Localization; @@ -301,7 +301,7 @@ private List fetchByDOI(DOI doi) throws FetcherException { private List fetchByArXiv(ArXivIdentifier arXivIdentifier) throws FetcherException { LOGGER.info("Found arxiv identifier in clipboard"); - Optional entry = new ArXiv(preferencesService.getImportFormatPreferences()).performSearchById(arXivIdentifier.getNormalizedWithoutVersion()); + Optional entry = new ArXivFetcher(preferencesService.getImportFormatPreferences()).performSearchById(arXivIdentifier.getNormalizedWithoutVersion()); return OptionalUtil.toList(entry); } } diff --git a/src/main/java/org/jabref/gui/maintable/CellFactory.java b/src/main/java/org/jabref/gui/maintable/CellFactory.java index 13c65f6a336..e3309e94c1d 100644 --- a/src/main/java/org/jabref/gui/maintable/CellFactory.java +++ b/src/main/java/org/jabref/gui/maintable/CellFactory.java @@ -36,7 +36,7 @@ public CellFactory(PreferencesService preferencesService, UndoManager undoManage TABLE_ICONS.put(new UnknownField("citeseerurl"), icon); icon = IconTheme.JabRefIcons.WWW; - // icon.setToolTipText(Localization.lang("Open") + " ArXiv URL"); + // icon.setToolTipText(Localization.lang("Open") 
+ " ArXivFetcher URL"); TABLE_ICONS.put(StandardField.EPRINT, icon); icon = IconTheme.JabRefIcons.DOI; diff --git a/src/main/java/org/jabref/logic/importer/CompositeIdFetcher.java b/src/main/java/org/jabref/logic/importer/CompositeIdFetcher.java index ee58f55a4b3..2db51889986 100644 --- a/src/main/java/org/jabref/logic/importer/CompositeIdFetcher.java +++ b/src/main/java/org/jabref/logic/importer/CompositeIdFetcher.java @@ -2,7 +2,7 @@ import java.util.Optional; -import org.jabref.logic.importer.fetcher.ArXiv; +import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.DoiFetcher; import org.jabref.logic.importer.fetcher.isbntobibtex.DoiToBibtexConverterComIsbnFetcher; import org.jabref.logic.importer.fetcher.isbntobibtex.EbookDeIsbnFetcher; @@ -27,7 +27,7 @@ public Optional performSearchById(String identifier) throws FetcherExc } Optional arXivIdentifier = ArXivIdentifier.parse(identifier); if (arXivIdentifier.isPresent()) { - return new ArXiv(importFormatPreferences).performSearchById(arXivIdentifier.get().getNormalized()); + return new ArXivFetcher(importFormatPreferences).performSearchById(arXivIdentifier.get().getNormalized()); } Optional isbn = ISBN.parse(identifier); if (isbn.isPresent()) { diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java index 69b0dbb692d..14caa4031f2 100644 --- a/src/main/java/org/jabref/logic/importer/WebFetchers.java +++ b/src/main/java/org/jabref/logic/importer/WebFetchers.java @@ -10,7 +10,7 @@ import org.jabref.logic.importer.fetcher.ACMPortalFetcher; import org.jabref.logic.importer.fetcher.ACS; import org.jabref.logic.importer.fetcher.ApsFetcher; -import org.jabref.logic.importer.fetcher.ArXiv; +import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem; import org.jabref.logic.importer.fetcher.BiodiversityLibrary; import org.jabref.logic.importer.fetcher.CiteSeer; @@ -70,7 +70,7 @@ public static Optional getIdBasedFetcherForField(Field field, Im .addRetryFetcher(new EbookDeIsbnFetcher(importFormatPreferences)) .addRetryFetcher(new DoiToBibtexConverterComIsbnFetcher(importFormatPreferences)); } else if (field == EPRINT) { - fetcher = new ArXiv(importFormatPreferences); + fetcher = new ArXivFetcher(importFormatPreferences); } else { return Optional.empty(); } @@ -98,7 +98,7 @@ public static Optional> getIdFetcherForField(Fie */ public static SortedSet getSearchBasedFetchers(ImportFormatPreferences importFormatPreferences, ImporterPreferences importerPreferences) { SortedSet set = new TreeSet<>(Comparator.comparing(WebFetcher::getName)); - set.add(new ArXiv(importFormatPreferences)); + set.add(new ArXivFetcher(importFormatPreferences)); set.add(new INSPIREFetcher(importFormatPreferences)); set.add(new GvkFetcher()); set.add(new MedlineFetcher()); @@ -129,7 +129,7 @@ public static SortedSet getSearchBasedFetchers(ImportFormatP public static SortedSet getIdBasedFetchers(ImportFormatPreferences importFormatPreferences, ImporterPreferences importerPreferences) { SortedSet set = new TreeSet<>(Comparator.comparing(WebFetcher::getName)); - set.add(new ArXiv(importFormatPreferences)); + set.add(new ArXivFetcher(importFormatPreferences)); set.add(new AstrophysicsDataSystem(importFormatPreferences, importerPreferences)); set.add(new IsbnFetcher(importFormatPreferences) .addRetryFetcher(new EbookDeIsbnFetcher(importFormatPreferences)) @@ -177,7 +177,7 @@ public static SortedSet 
getEntryBasedFetchers(ImporterPrefere public static SortedSet> getIdFetchers(ImportFormatPreferences importFormatPreferences) { SortedSet> set = new TreeSet<>(Comparator.comparing(WebFetcher::getName)); set.add(new CrossRef()); - set.add(new ArXiv(importFormatPreferences)); + set.add(new ArXivFetcher(importFormatPreferences)); return set; } @@ -194,7 +194,7 @@ public static Set getFullTextFetchers(ImportFormatPreferences i fetchers.add(new ScienceDirect(importerPreferences)); fetchers.add(new SpringerLink(importerPreferences)); fetchers.add(new ACS()); - fetchers.add(new ArXiv(importFormatPreferences)); + fetchers.add(new ArXivFetcher(importFormatPreferences)); fetchers.add(new IEEE(importFormatPreferences, importerPreferences)); fetchers.add(new ApsFetcher()); diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java deleted file mode 100644 index cf8c8aec2ec..00000000000 --- a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java +++ /dev/null @@ -1,424 +0,0 @@ -package org.jabref.logic.importer.fetcher; - -import java.io.IOException; -import java.net.HttpURLConnection; -import java.net.MalformedURLException; -import java.net.URISyntaxException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Objects; -import java.util.Optional; -import java.util.stream.Collectors; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; - -import org.jabref.logic.cleanup.EprintCleanup; -import org.jabref.logic.help.HelpFile; -import org.jabref.logic.importer.FetcherException; -import org.jabref.logic.importer.FulltextFetcher; -import org.jabref.logic.importer.IdBasedFetcher; -import org.jabref.logic.importer.IdFetcher; -import org.jabref.logic.importer.ImportFormatPreferences; -import org.jabref.logic.importer.PagedSearchBasedFetcher; -import org.jabref.logic.importer.fetcher.transformers.ArXivQueryTransformer; -import org.jabref.logic.util.io.XMLUtil; -import org.jabref.logic.util.strings.StringSimilarity; -import org.jabref.model.entry.BibEntry; -import org.jabref.model.entry.LinkedFile; -import org.jabref.model.entry.field.StandardField; -import org.jabref.model.entry.identifier.ArXivIdentifier; -import org.jabref.model.entry.identifier.DOI; -import org.jabref.model.entry.types.StandardEntryType; -import org.jabref.model.paging.Page; -import org.jabref.model.strings.StringUtil; -import org.jabref.model.util.OptionalUtil; - -import org.apache.http.client.utils.URIBuilder; -import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.xml.sax.SAXException; - -/** - * Fetcher for the arXiv. - * - * @see ArXiv API for an overview of the API - * @see ArXiv API User's Manual for a detailed - * description on how to use the API - *

- * Similar implementions: - * arxiv2bib which is live - * dspace-portalmec - */ -public class ArXiv implements FulltextFetcher, PagedSearchBasedFetcher, IdBasedFetcher, IdFetcher { - - private static final Logger LOGGER = LoggerFactory.getLogger(ArXiv.class); - - private static final String API_URL = "https://export.arxiv.org/api/query"; - - private final ImportFormatPreferences importFormatPreferences; - - public ArXiv(ImportFormatPreferences importFormatPreferences) { - this.importFormatPreferences = importFormatPreferences; - } - - @Override - public Optional findFullText(BibEntry entry) throws IOException { - Objects.requireNonNull(entry); - - try { - Optional pdfUrl = searchForEntries(entry).stream() - .map(ArXivEntry::getPdfUrl) - .filter(Optional::isPresent) - .map(Optional::get) - .findFirst(); - pdfUrl.ifPresent(url -> LOGGER.info("Fulltext PDF found @ arXiv.")); - return pdfUrl; - } catch (FetcherException e) { - LOGGER.warn("arXiv API request failed", e); - } - - return Optional.empty(); - } - - @Override - public TrustLevel getTrustLevel() { - return TrustLevel.PREPRINT; - } - - private Optional searchForEntry(String searchQuery) throws FetcherException { - List entries = queryApi(searchQuery, Collections.emptyList(), 0, 1); - if (entries.size() == 1) { - return Optional.of(entries.get(0)); - } else { - return Optional.empty(); - } - } - - private Optional searchForEntryById(String id) throws FetcherException { - Optional identifier = ArXivIdentifier.parse(id); - if (identifier.isEmpty()) { - return Optional.empty(); - } - - List entries = queryApi("", Collections.singletonList(identifier.get()), 0, 1); - if (entries.size() >= 1) { - return Optional.of(entries.get(0)); - } else { - return Optional.empty(); - } - } - - private List searchForEntries(BibEntry originalEntry) throws FetcherException { - // We need to clone the entry, because we modify it by a cleanup job. - final BibEntry entry = (BibEntry) originalEntry.clone(); - - // 1. Check for Eprint - new EprintCleanup().cleanup(entry); - Optional identifier = entry.getField(StandardField.EPRINT); - if (StringUtil.isNotBlank(identifier)) { - try { - // Get pdf of entry with the specified id - return OptionalUtil.toList(searchForEntryById(identifier.get())); - } catch (FetcherException e) { - LOGGER.warn("arXiv eprint API request failed", e); - } - } - - // 2. 
DOI and other fields - String query = entry.getField(StandardField.DOI) - .flatMap(DOI::parse) - .map(DOI::getNormalized) - .map(doiString -> "doi:" + doiString) - .orElseGet(() -> { - Optional authorQuery = entry.getField(StandardField.AUTHOR).map(author -> "au:" + author); - Optional titleQuery = entry.getField(StandardField.TITLE).map(title -> "ti:" + StringUtil.ignoreCurlyBracket(title)); - return String.join("+AND+", OptionalUtil.toList(authorQuery, titleQuery)); - }); - Optional arxivEntry = searchForEntry(query); - if (arxivEntry.isPresent()) { - // Check if entry is a match - StringSimilarity match = new StringSimilarity(); - String arxivTitle = arxivEntry.get().title.orElse(""); - String entryTitle = StringUtil.ignoreCurlyBracket(entry.getField(StandardField.TITLE).orElse("")); - if (match.isSimilar(arxivTitle, entryTitle)) { - return OptionalUtil.toList(arxivEntry); - } - } - - return Collections.emptyList(); - } - - private List searchForEntries(String searchQuery, int pageNumber) throws FetcherException { - return queryApi(searchQuery, Collections.emptyList(), getPageSize() * pageNumber, getPageSize()); - } - - private List queryApi(String searchQuery, List ids, int start, int maxResults) - throws FetcherException { - Document result = callApi(searchQuery, ids, start, maxResults); - List entries = XMLUtil.asList(result.getElementsByTagName("entry")); - - return entries.stream().map(ArXivEntry::new).collect(Collectors.toList()); - } - - /** - * Queries the API. - *

- * If only {@code searchQuery} is given, then the API will return results for each article that matches the query. - * If only {@code ids} is given, then the API will return results for each article in the list. - * If both {@code searchQuery} and {@code ids} are given, then the API will return each article in - * {@code ids} that matches {@code searchQuery}. This allows the API to act as a results filter. - * - * @param searchQuery the search query used to find articles; - * details - * @param ids a list of arXiv identifiers - * @param start the index of the first returned result (zero-based) - * @param maxResults the number of maximal results (has to be smaller than 2000) - * @return the response from the API as a XML document (Atom 1.0) - * @throws FetcherException if there was a problem while building the URL or the API was not accessible - */ - private Document callApi(String searchQuery, List ids, int start, int maxResults) throws FetcherException { - if (maxResults > 2000) { - throw new IllegalArgumentException("The arXiv API limits the number of maximal results to be 2000"); - } - - try { - URIBuilder uriBuilder = new URIBuilder(API_URL); - // The arXiv API has problems with accents, so we remove them (i.e. Fréchet -> Frechet) - if (StringUtil.isNotBlank(searchQuery)) { - uriBuilder.addParameter("search_query", StringUtil.stripAccents(searchQuery)); - } - if (!ids.isEmpty()) { - uriBuilder.addParameter("id_list", - ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(","))); - } - uriBuilder.addParameter("start", String.valueOf(start)); - uriBuilder.addParameter("max_results", String.valueOf(maxResults)); - URL url = uriBuilder.build().toURL(); - - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - DocumentBuilder builder = factory.newDocumentBuilder(); - - HttpURLConnection connection = (HttpURLConnection) url.openConnection(); - if (connection.getResponseCode() == 400) { - // Bad request error from server, try to get more information - throw getException(builder.parse(connection.getErrorStream())); - } else { - return builder.parse(connection.getInputStream()); - } - } catch (SAXException | ParserConfigurationException | IOException | URISyntaxException exception) { - throw new FetcherException("arXiv API request failed", exception); - } - } - - private FetcherException getException(Document error) { - List entries = XMLUtil.asList(error.getElementsByTagName("entry")); - - // Check if the API returned an error - // In case of an error, only one entry will be returned with the error information. For example: - // https://export.arxiv.org/api/query?id_list=0307015 - // - // https://arxiv.org/api/errors#incorrect_id_format_for_0307015 - // Error - //

incorrect id format for 0307015 - // - if (entries.size() == 1) { - Node node = entries.get(0); - Optional id = XMLUtil.getNodeContent(node, "id"); - Boolean isError = id.map(idContent -> idContent.startsWith("http://arxiv.org/api/errors")).orElse(false); - if (isError) { - String errorMessage = XMLUtil.getNodeContent(node, "summary").orElse("Unknown error"); - return new FetcherException(errorMessage); - } - } - return new FetcherException("arXiv API request failed"); - } - - @Override - public String getName() { - return "ArXiv"; - } - - @Override - public Optional getHelpPage() { - return Optional.of(HelpFile.FETCHER_OAI2_ARXIV); - } - - /** - * Constructs a complex query string using the field prefixes specified at https://arxiv.org/help/api/user-manual - * - * @param luceneQuery the root node of the lucene query - * @return A list of entries matching the complex query - */ - @Override - public Page performSearchPaged(QueryNode luceneQuery, int pageNumber) throws FetcherException { - ArXivQueryTransformer transformer = new ArXivQueryTransformer(); - String transformedQuery = transformer.transformLuceneQuery(luceneQuery).orElse(""); - List searchResult = searchForEntries(transformedQuery, pageNumber).stream() - .map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())) - .collect(Collectors.toList()); - return new Page<>(transformedQuery, pageNumber, filterYears(searchResult, transformer)); - } - - private List filterYears(List searchResult, ArXivQueryTransformer transformer) { - return searchResult.stream() - .filter(entry -> entry.getField(StandardField.DATE).isPresent()) - // Filter the date field for year only - .filter(entry -> transformer.getEndYear().isEmpty() || Integer.parseInt(entry.getField(StandardField.DATE).get().substring(0, 4)) <= transformer.getEndYear().get()) - .filter(entry -> transformer.getStartYear().isEmpty() || Integer.parseInt(entry.getField(StandardField.DATE).get().substring(0, 4)) >= transformer.getStartYear().get()) - .collect(Collectors.toList()); - } - - @Override - public Optional performSearchById(String identifier) throws FetcherException { - return searchForEntryById(identifier) - .map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())); - } - - @Override - public Optional findIdentifier(BibEntry entry) throws FetcherException { - return searchForEntries(entry).stream() - .map(ArXivEntry::getId) - .filter(Optional::isPresent) - .map(Optional::get) - .findFirst(); - } - - @Override - public String getIdentifierName() { - return "ArXiv"; - } - - private static class ArXivEntry { - - private final Optional title; - private final Optional urlAbstractPage; - private final Optional publishedDate; - private final Optional abstractText; - private final List authorNames; - private final List categories; - private final Optional pdfUrl; - private final Optional doi; - private final Optional journalReferenceText; - private final Optional primaryCategory; - - public ArXivEntry(Node item) { - // see https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned - - // Title of the article - // The result from the arXiv contains hard line breaks, try to remove them - title = XMLUtil.getNodeContent(item, "title").map(ArXivEntry::correctLineBreaks); - - // The url leading to the abstract page - urlAbstractPage = XMLUtil.getNodeContent(item, "id"); - - // Date on which the first version was published - publishedDate = XMLUtil.getNodeContent(item, "published"); - - // Abstract of the article - 
abstractText = XMLUtil.getNodeContent(item, "summary").map(ArXivEntry::correctLineBreaks) - .map(String::trim); - - // Authors of the article - authorNames = new ArrayList<>(); - for (Node authorNode : XMLUtil.getNodesByName(item, "author")) { - Optional authorName = XMLUtil.getNodeContent(authorNode, "name").map(String::trim); - authorName.ifPresent(authorNames::add); - } - - // Categories (arXiv, ACM, or MSC classification) - categories = new ArrayList<>(); - for (Node categoryNode : XMLUtil.getNodesByName(item, "category")) { - Optional category = XMLUtil.getAttributeContent(categoryNode, "term"); - category.ifPresent(categories::add); - } - - // Links - Optional pdfUrlParsed = Optional.empty(); - for (Node linkNode : XMLUtil.getNodesByName(item, "link")) { - Optional linkTitle = XMLUtil.getAttributeContent(linkNode, "title"); - if (linkTitle.equals(Optional.of("pdf"))) { - pdfUrlParsed = XMLUtil.getAttributeContent(linkNode, "href").map(url -> { - try { - return new URL(url); - } catch (MalformedURLException e) { - return null; - } - }); - } - } - pdfUrl = pdfUrlParsed; - - // Associated DOI - doi = XMLUtil.getNodeContent(item, "arxiv:doi"); - - // Journal reference (as provided by the author) - journalReferenceText = XMLUtil.getNodeContent(item, "arxiv:journal_ref"); - - // Primary category - // Ex: - primaryCategory = XMLUtil.getNode(item, "arxiv:primary_category") - .flatMap(node -> XMLUtil.getAttributeContent(node, "term")); - } - - public static String correctLineBreaks(String s) { - String result = s.replaceAll("\\n(?!\\s*\\n)", " "); - result = result.replaceAll("\\s*\\n\\s*", "\n"); - return result.replaceAll(" {2,}", " ").replaceAll("(^\\s*|\\s+$)", ""); - } - - /** - * Returns the url of the linked pdf - */ - public Optional getPdfUrl() { - return pdfUrl; - } - - /** - * Returns the arXiv identifier - */ - public Optional getIdString() { - return urlAbstractPage.flatMap(ArXivIdentifier::parse).map(ArXivIdentifier::getNormalizedWithoutVersion); - } - - public Optional getId() { - return getIdString().flatMap(ArXivIdentifier::parse); - } - - /** - * Returns the date when the first version was put on the arXiv - */ - public Optional getDate() { - // Publication string also contains time, e.g. 
2014-05-09T14:49:43Z - return publishedDate.map(date -> { - if (date.length() < 10) { - return null; - } else { - return date.substring(0, 10); - } - }); - } - - public BibEntry toBibEntry(Character keywordDelimiter) { - BibEntry bibEntry = new BibEntry(StandardEntryType.Article); - bibEntry.setField(StandardField.EPRINTTYPE, "arXiv"); - bibEntry.setField(StandardField.AUTHOR, String.join(" and ", authorNames)); - bibEntry.addKeywords(categories, keywordDelimiter); - getIdString().ifPresent(id -> bibEntry.setField(StandardField.EPRINT, id)); - title.ifPresent(titleContent -> bibEntry.setField(StandardField.TITLE, titleContent)); - doi.ifPresent(doiContent -> bibEntry.setField(StandardField.DOI, doiContent)); - abstractText.ifPresent(abstractContent -> bibEntry.setField(StandardField.ABSTRACT, abstractContent)); - getDate().ifPresent(date -> bibEntry.setField(StandardField.DATE, date)); - primaryCategory.ifPresent(category -> bibEntry.setField(StandardField.EPRINTCLASS, category)); - journalReferenceText.ifPresent(journal -> bibEntry.setField(StandardField.JOURNALTITLE, journal)); - getPdfUrl().ifPresent(url -> bibEntry.setFiles(Collections.singletonList(new LinkedFile(url, "PDF")))); - return bibEntry; - } - } -} diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java new file mode 100644 index 00000000000..6332dc24854 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java @@ -0,0 +1,772 @@ +package org.jabref.logic.importer.fetcher; + +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.stream.Collectors; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.jabref.logic.cleanup.EprintCleanup; +import org.jabref.logic.help.HelpFile; +import org.jabref.logic.importer.FetcherException; +import org.jabref.logic.importer.FulltextFetcher; +import org.jabref.logic.importer.IdBasedFetcher; +import org.jabref.logic.importer.IdFetcher; +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.PagedSearchBasedFetcher; +import org.jabref.logic.importer.fetcher.transformers.ArXivQueryTransformer; +import org.jabref.logic.util.io.XMLUtil; +import org.jabref.logic.util.strings.StringSimilarity; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.KeywordList; +import org.jabref.model.entry.LinkedFile; +import org.jabref.model.entry.field.Field; +import org.jabref.model.entry.field.InternalField; +import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.identifier.ArXivIdentifier; +import org.jabref.model.entry.identifier.DOI; +import org.jabref.model.entry.types.StandardEntryType; +import org.jabref.model.paging.Page; +import org.jabref.model.strings.StringUtil; +import org.jabref.model.util.OptionalUtil; + +import com.google.common.collect.ImmutableMap; +import 
org.apache.http.client.utils.URIBuilder; +import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.xml.sax.SAXException; + +/** + * Fetcher for ArXiv that merges fields from arXiv-issued DOIs (and user-issued ones when applicable) to get more information overall. + *

+ * These are the post-processing steps applied to the original fetch from ArXiv's API (a usage sketch follows the list):
+ * <ol>
+ *   <li>Use the ArXiv-issued DOI to merge more data into the original entry, overwriting some of its fields;</li>
+ *   <li>Use the user-issued DOI (if one was provided) to merge even more data into the result of the previous step, overwriting some of its fields;</li>
+ *   <li>Adapt keywords: remove repetitions and handle some edge cases (commas inside a keyword are transformed into forward slashes).</li>
+ * </ol>
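+ * <p>
+ * A minimal usage sketch (not part of the change itself; it assumes an {@link ImportFormatPreferences} instance named {@code importFormatPreferences} is available and uses an arbitrary arXiv identifier):
+ * <pre>{@code
+ * ArXivFetcher fetcher = new ArXivFetcher(importFormatPreferences);
+ * // Fetches the arXiv entry and merges additional fields from the arXiv-issued DOI
+ * // (and from a user-issued DOI, if the entry provides one); throws FetcherException on failure.
+ * Optional<BibEntry> entry = fetcher.performSearchById("2201.00001");
+ * }</pre>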
+ * + * @see arXiv.org blog for more info about arXiv-issued DOIs + * @see ArXiv API for an overview of the API + * @see ArXiv API User's Manual for a detailed description on how to use the API + */ +public class ArXivFetcher implements FulltextFetcher, PagedSearchBasedFetcher, IdBasedFetcher, IdFetcher { + + private static final Logger LOGGER = LoggerFactory.getLogger(ArXivFetcher.class); + + // See https://blog.arxiv.org/2022/02/17/new-arxiv-articles-are-now-automatically-assigned-dois/ + private static final String DOI_PREFIX = "10.48550/arXiv."; + + /* + * Reason behind choice of these fields: + * - KEYWORDS: More descriptive + * - AUTHOR: Better formatted (last name, rest of name) + */ + private static final Set CHOSEN_AUTOMATIC_DOI_FIELDS = Set.of(StandardField.KEYWORDS, StandardField.AUTHOR); + + /* + * Reason behind choice of these fields: + * - DOI: give preference to DOIs manually inputted by users, instead of automatic ones + * - PUBLISHER: ArXiv-issued DOIs give 'ArXiv' as entry publisher. While this can be true, prefer using one from external sources, + * if applicable + * - KEY_FIELD: Usually, the KEY_FIELD retrieved from user-assigned DOIs are 'nicer' (instead of a DOI link, it's usually contains one author and the year) + */ + private static final Set CHOSEN_MANUAL_DOI_FIELDS = Set.of(StandardField.DOI, StandardField.PUBLISHER, InternalField.KEY_FIELD); + + private static final Map ARXIV_KEYWORDS_WITH_COMMA_REPLACEMENTS = ImmutableMap.of( + "Computational Engineering, Finance, and Science", "Computational Engineering / Finance / Science", + "Distributed, Parallel, and Cluster Computing", "Distributed / Parallel / Cluster Computing"); + + private final ArXiv arXiv; + private final DoiFetcher doiFetcher; + private final ImportFormatPreferences importFormatPreferences; + + public ArXivFetcher(ImportFormatPreferences importFormatPreferences) { + this(importFormatPreferences, new DoiFetcher(importFormatPreferences)); + } + + /** + * @param doiFetcher The fetcher, maybe be NULL if no additional search is desired. + */ + public ArXivFetcher(ImportFormatPreferences importFormatPreferences, DoiFetcher doiFetcher) { + this.arXiv = new ArXiv(importFormatPreferences); + this.doiFetcher = doiFetcher; + this.importFormatPreferences = importFormatPreferences; + } + + @Override + public Optional findFullText(BibEntry entry) throws IOException { + return arXiv.findFullText(entry); + } + + @Override + public TrustLevel getTrustLevel() { + return arXiv.getTrustLevel(); + } + + @Override + public String getName() { + return arXiv.getName(); + } + + @Override + public Optional getHelpPage() { + return arXiv.getHelpPage(); + } + + /** + * Remove duplicate values on "KEYWORD" field, if any. Al + * + * @param bibEntry A BibEntry to modify + */ + private void adaptKeywordsFrom(BibEntry bibEntry) { + Optional allKeywords = bibEntry.getField(StandardField.KEYWORDS); + if (allKeywords.isPresent()) { + // With the use of ArXiv-issued DOI's KEYWORDS field, some of those keywords might contain comma. 
As this is the + // default keyword separator, replace the commas of these instances with some other character + // (see ARXIV_KEYWORDS_WITH_COMMA_REPLACEMENTS variable) + for (Map.Entry entry : ARXIV_KEYWORDS_WITH_COMMA_REPLACEMENTS.entrySet()) { + allKeywords = Optional.of(allKeywords.get().replaceAll(entry.getKey(), entry.getValue())); + } + + String filteredKeywords = KeywordList.merge(allKeywords.get(), "", importFormatPreferences.getKeywordSeparator()).toString(); + bibEntry.setField(StandardField.KEYWORDS, filteredKeywords); + } + } + + /** + * Get ArXiv-issued DOI from the entry's arXiv ID + *

+ * ArXiv-issued DOIs are identifiers associated with every ArXiv entry. They are composed of a fixed + * {@link #DOI_PREFIX} + the entry's ArXiv ID + * + * @param arXivId An ArXiv ID + * @return ArXiv-issued DOI + */ + private static String getAutomaticDoi(String arXivId) { + return DOI_PREFIX + arXivId; + } + + /** + * Get ArXiv-issued DOI from the arXiv entry itself. + *

+ * ArXiv-issued DOIs are identifiers associated with every ArXiv entry. They are composed of a fixed {@link #DOI_PREFIX} + the entry's ArXiv ID + * + * @param arXivBibEntry A Bibtex Entry, formatted as a ArXiv entry. Must contain an EPRINT field + * @return ArXiv-issued DOI, or Empty, if method could not retrieve it + */ + private static Optional getAutomaticDoi(BibEntry arXivBibEntry) { + // As the input should always contain a EPRINT if created from inner 'ArXiv' class, don't bother doing a check that might call + // ArXiv's API again (method 'findIdentifier') + Optional entryEprint = arXivBibEntry.getField(StandardField.EPRINT); + if (entryEprint.isEmpty()) { + LOGGER.error("Cannot infer ArXiv-issued DOI from BibEntry: no 'EPRINT' field found"); + return Optional.empty(); + } else { + return Optional.of(ArXivFetcher.getAutomaticDoi(entryEprint.get())); + } + } + + /** + * Get ArXiv-issued DOI from ArXiv Identifier object + *

+ * ArXiv-issued DOIs are identifiers associated with every ArXiv entry. They are composed of a fixed {@link #DOI_PREFIX} + the entry's ArXiv ID + * + * @param arXivId An ArXiv ID as internal object + * @return ArXiv-issued DOI + */ + private static String getAutomaticDoi(ArXivIdentifier arXivId) { + return getAutomaticDoi(arXivId.getNormalizedWithoutVersion()); + } + + /** + * Check if a specific DOI is user-assigned. + */ + private static boolean isManualDoi(String doi) { + return !doi.toLowerCase().contains(DOI_PREFIX.toLowerCase()); + } + + /** + * Get user-issued DOI from ArXiv Bibtex entry, if any + *

+ * User-issued DOIs are identifiers associated with some ArXiv entries that can associate an entry with an external service, like + * Springer Link. + * + * @param arXivBibEntry An ArXiv Bibtex entry from where the DOI is extracted + * @return User-issued DOI, if any field exists and if it's not an automatic one (see {@link #getAutomaticDoi(ArXivIdentifier)}) + */ + private static Optional getManualDoi(BibEntry arXivBibEntry) { + return arXivBibEntry.getField(StandardField.DOI).filter(ArXivFetcher::isManualDoi); + } + + /** + * Get the Bibtex Entry from a Future API request (uses blocking) and treat exceptions. + * + * @param bibEntryFuture A CompletableFuture that parallelize the API fetching process + * @return the fetch result + */ + private static Optional waitForBibEntryRetrieval(CompletableFuture> bibEntryFuture) throws FetcherException { + try { + return bibEntryFuture.join(); + } catch (CompletionException e) { + if (!(e.getCause() instanceof FetcherException)) { + LOGGER.error("The supplied future should only throw a FetcherException.", e); + throw e; + } + throw (FetcherException) e.getCause(); + } + } + + /** + * Eventually merge the ArXiv Bibtex entry with a Future Bibtex entry (ArXiv/user-assigned DOIs) + * + * @param arXivEntry The entry to merge into + * @param bibEntryFuture A future result of the fetching process + * @param priorityFields Which fields from "bibEntryFuture" to prioritize, replacing them on "arXivEntry" + * @param id Identifier used in initiating the "bibEntryFuture" future (for logging). This is usually a DOI, but can be anything. + */ + private void mergeArXivEntryWithFutureDoiEntry(BibEntry arXivEntry, CompletableFuture> bibEntryFuture, Set priorityFields, String id) { + Optional bibEntry; + try { + bibEntry = waitForBibEntryRetrieval(bibEntryFuture); + } catch (FetcherException | CompletionException e) { + LOGGER.error("Failed to fetch future BibEntry with id '{}' (skipping merge).", id, e); + return; + } + + if (bibEntry.isPresent()) { + adaptKeywordsFrom(bibEntry.get()); + arXivEntry.mergeWith(bibEntry.get(), priorityFields); + } else { + LOGGER.error("Future BibEntry for id '{}' was completed, but no entry was found (skipping merge).", id); + } + } + + /** + * Infuse arXivBibEntryPromise with additional fields in an asynchronous way + * + * @param arXivBibEntry An existing entry to be updated with new/modified fields + */ + private void inplaceAsyncInfuseArXivWithDoi(BibEntry arXivBibEntry) { + CompletableFuture> arXivBibEntryCompletedFuture = CompletableFuture.completedFuture(Optional.of(arXivBibEntry)); + Optional arXivBibEntryId = arXivBibEntry.getField(StandardField.EPRINT).flatMap(ArXivIdentifier::parse); + + try { + this.inplaceAsyncInfuseArXivWithDoi(arXivBibEntryCompletedFuture, arXivBibEntryId); + } catch (FetcherException e) { + LOGGER.error("FetcherException should not be found here, as main Bibtex Entry already exists " + + "(and failing additional fetches should be skipped)", e); + } + } + + /** + * Infuse arXivBibEntryFuture with additional fields in an asynchronous way, accelerating the process by providing a valid ArXiv ID + * + * @param arXivBibEntryFuture A future entry that (if it exists) will be updated with new/modified fields + * @param arXivId An ArXiv ID for the main reference (from ArXiv), so that the retrieval of ArXiv-issued DOI metadata can be faster + * @throws FetcherException when failed to fetch the main ArtXiv Bibtex entry ('arXivBibEntryFuture'). 
+ */ + private void inplaceAsyncInfuseArXivWithDoi(CompletableFuture> arXivBibEntryFuture, Optional arXivId) throws FetcherException { + + Optional>> automaticDoiBibEntryFuture; + Optional arXivBibEntry; + + Optional automaticDoi; + Optional manualDoi; + + // We can accelerate the processing time by initiating a parallel request for DOIFetcher with an ArXiv-issued DOI alongside the ArXiv fetching itself, + // BUT ONLY IF we have a valid arXivId. If not, the ArXiv entry must be retrieved before, which invalidates this optimization (although we can still speed + // up the process by running both the ArXiv-assigned and user-assigned DOI fetching at the same time, if an entry has this last information) + if (arXivId.isPresent()) { + automaticDoi = Optional.of(ArXivFetcher.getAutomaticDoi(arXivId.get())); + automaticDoiBibEntryFuture = Optional.of(doiFetcher.asyncPerformSearchById(automaticDoi.get())); + + arXivBibEntry = ArXivFetcher.waitForBibEntryRetrieval(arXivBibEntryFuture); + if (arXivBibEntry.isEmpty()) { + return; + } + } else { + // If ArXiv fetch fails (FetcherException), exception must be passed onwards for the transparency of this class (original ArXiv fetcher does the same) + arXivBibEntry = ArXivFetcher.waitForBibEntryRetrieval(arXivBibEntryFuture); + if (arXivBibEntry.isEmpty()) { + return; + } + + automaticDoi = ArXivFetcher.getAutomaticDoi(arXivBibEntry.get()); + automaticDoiBibEntryFuture = automaticDoi.map(arXiv::asyncPerformSearchById); + } + + manualDoi = ArXivFetcher.getManualDoi(arXivBibEntry.get()); + Optional>> manualDoiBibEntryFuture = manualDoi.map(doiFetcher::asyncPerformSearchById); + + automaticDoiBibEntryFuture.ifPresent(future -> + mergeArXivEntryWithFutureDoiEntry(arXivBibEntry.get(), future, CHOSEN_AUTOMATIC_DOI_FIELDS, automaticDoi.get())); + manualDoiBibEntryFuture.ifPresent(future -> + mergeArXivEntryWithFutureDoiEntry(arXivBibEntry.get(), future, CHOSEN_MANUAL_DOI_FIELDS, manualDoi.get())); + } + + /** + * Constructs a complex query string using the field prefixes specified at https://arxiv.org/help/api/user-manual + * and modify resulting BibEntries with additional info from the ArXiv-issued DOI + * + * @param luceneQuery the root node of the lucene query + * @return A list of entries matching the complex query + */ + @Override + public Page performSearchPaged(QueryNode luceneQuery, int pageNumber) throws FetcherException { + + Page result = arXiv.performSearchPaged(luceneQuery, pageNumber); + if (this.doiFetcher == null) { + return result; + } + + ExecutorService executor = Executors.newFixedThreadPool(getPageSize() * 2); + + Collection> futureSearchResult = result.getContent() + .stream() + .map(bibEntry -> + CompletableFuture.supplyAsync(() -> { + this.inplaceAsyncInfuseArXivWithDoi(bibEntry); + return bibEntry; + }, executor)) + .toList(); + + Collection modifiedSearchResult = futureSearchResult.stream() + .map(CompletableFuture::join) + .collect(Collectors.toList()); + + return new Page<>(result.getQuery(), result.getPageNumber(), modifiedSearchResult); + } + + @Override + public Optional performSearchById(String identifier) throws FetcherException { + CompletableFuture> arXivBibEntryPromise = arXiv.asyncPerformSearchById(identifier); + if (this.doiFetcher != null) { + inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, ArXivIdentifier.parse(identifier)); + } + return arXivBibEntryPromise.join(); + } + + @Override + public Optional findIdentifier(BibEntry entry) throws FetcherException { + return arXiv.findIdentifier(entry); + } + + @Override + 
public String getIdentifierName() { + return arXiv.getIdentifierName(); + } + + /** + * Fetcher for the arXiv. + * + * @see ArXiv API for an overview of the API + * @see ArXiv API User's Manual for a detailed + * description on how to use the API + *

+ * Similar implementions: + * arxiv2bib which is live + * dspace-portalmec + */ + protected class ArXiv implements FulltextFetcher, PagedSearchBasedFetcher, IdBasedFetcher, IdFetcher { + + private static final Logger LOGGER = LoggerFactory.getLogger(org.jabref.logic.importer.fetcher.ArXivFetcher.ArXiv.class); + + private static final String API_URL = "https://export.arxiv.org/api/query"; + + private final ImportFormatPreferences importFormatPreferences; + + public ArXiv(ImportFormatPreferences importFormatPreferences) { + this.importFormatPreferences = importFormatPreferences; + } + + @Override + public Optional findFullText(BibEntry entry) throws IOException { + Objects.requireNonNull(entry); + + try { + Optional pdfUrl = searchForEntries(entry).stream() + .map(ArXivEntry::getPdfUrl) + .filter(Optional::isPresent) + .map(Optional::get) + .findFirst(); + pdfUrl.ifPresent(url -> LOGGER.info("Fulltext PDF found @ arXiv.")); + return pdfUrl; + } catch (FetcherException e) { + LOGGER.warn("arXiv API request failed", e); + } + + return Optional.empty(); + } + + @Override + public TrustLevel getTrustLevel() { + return TrustLevel.PREPRINT; + } + + private Optional searchForEntry(String searchQuery) throws FetcherException { + List entries = queryApi(searchQuery, Collections.emptyList(), 0, 1); + if (entries.size() == 1) { + return Optional.of(entries.get(0)); + } else { + return Optional.empty(); + } + } + + private Optional searchForEntryById(String id) throws FetcherException { + Optional identifier = ArXivIdentifier.parse(id); + if (identifier.isEmpty()) { + return Optional.empty(); + } + + List entries = queryApi("", Collections.singletonList(identifier.get()), 0, 1); + if (entries.size() >= 1) { + return Optional.of(entries.get(0)); + } else { + return Optional.empty(); + } + } + + private List searchForEntries(BibEntry originalEntry) throws FetcherException { + // We need to clone the entry, because we modify it by a cleanup job. + final BibEntry entry = (BibEntry) originalEntry.clone(); + + // 1. Check for Eprint + new EprintCleanup().cleanup(entry); + Optional identifier = entry.getField(StandardField.EPRINT); + if (StringUtil.isNotBlank(identifier)) { + try { + // Get pdf of entry with the specified id + return OptionalUtil.toList(searchForEntryById(identifier.get())); + } catch (FetcherException e) { + LOGGER.warn("arXiv eprint API request failed", e); + } + } + + // 2. 
DOI and other fields + String query; + Optional doiString = entry.getField(StandardField.DOI) + .flatMap(DOI::parse) + .map(DOI::getNormalized); + + // ArXiv-issued DOIs seem to be unsearchable from ArXiv API's "query string", so ignore it + if (doiString.isPresent() && ArXivFetcher.isManualDoi(doiString.get())) { + query = "doi:" + doiString.get(); + } else { + Optional authorQuery = entry.getField(StandardField.AUTHOR).map(author -> "au:" + author); + Optional titleQuery = entry.getField(StandardField.TITLE).map(title -> "ti:" + StringUtil.ignoreCurlyBracket(title)); + query = String.join("+AND+", OptionalUtil.toList(authorQuery, titleQuery)); + } + + Optional arxivEntry = searchForEntry(query); + if (arxivEntry.isPresent()) { + // Check if entry is a match + StringSimilarity match = new StringSimilarity(); + String arxivTitle = arxivEntry.get().title.orElse(""); + String entryTitle = StringUtil.ignoreCurlyBracket(entry.getField(StandardField.TITLE).orElse("")); + if (match.isSimilar(arxivTitle, entryTitle)) { + return OptionalUtil.toList(arxivEntry); + } + } + + return Collections.emptyList(); + } + + private List searchForEntries(String searchQuery, int pageNumber) throws FetcherException { + return queryApi(searchQuery, Collections.emptyList(), getPageSize() * pageNumber, getPageSize()); + } + + private List queryApi(String searchQuery, List ids, int start, int maxResults) + throws FetcherException { + Document result = callApi(searchQuery, ids, start, maxResults); + List entries = XMLUtil.asList(result.getElementsByTagName("entry")); + + return entries.stream().map(ArXivEntry::new).collect(Collectors.toList()); + } + + /** + * Queries the API. + *

+ * If only {@code searchQuery} is given, then the API will return results for each article that matches the query. + * If only {@code ids} is given, then the API will return results for each article in the list. + * If both {@code searchQuery} and {@code ids} are given, then the API will return each article in + * {@code ids} that matches {@code searchQuery}. This allows the API to act as a results filter. + * + * @param searchQuery the search query used to find articles; + * details + * @param ids a list of arXiv identifiers + * @param start the index of the first returned result (zero-based) + * @param maxResults the number of maximal results (has to be smaller than 2000) + * @return the response from the API as a XML document (Atom 1.0) + * @throws FetcherException if there was a problem while building the URL or the API was not accessible + */ + private Document callApi(String searchQuery, List ids, int start, int maxResults) throws FetcherException { + if (maxResults > 2000) { + throw new IllegalArgumentException("The arXiv API limits the number of maximal results to be 2000"); + } + + try { + URIBuilder uriBuilder = new URIBuilder(API_URL); + // The arXiv API has problems with accents, so we remove them (i.e. Fréchet -> Frechet) + if (StringUtil.isNotBlank(searchQuery)) { + uriBuilder.addParameter("search_query", StringUtil.stripAccents(searchQuery)); + } + if (!ids.isEmpty()) { + uriBuilder.addParameter("id_list", + ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(","))); + } + uriBuilder.addParameter("start", String.valueOf(start)); + uriBuilder.addParameter("max_results", String.valueOf(maxResults)); + URL url = uriBuilder.build().toURL(); + + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + if (connection.getResponseCode() == 400) { + // Bad request error from server, try to get more information + throw getException(builder.parse(connection.getErrorStream())); + } else { + return builder.parse(connection.getInputStream()); + } + } catch (SAXException | ParserConfigurationException | IOException | URISyntaxException exception) { + throw new FetcherException("arXiv API request failed", exception); + } + } + + private FetcherException getException(Document error) { + List entries = XMLUtil.asList(error.getElementsByTagName("entry")); + + // Check if the API returned an error + // In case of an error, only one entry will be returned with the error information. For example: + // https://export.arxiv.org/api/query?id_list=0307015 + // + // https://arxiv.org/api/errors#incorrect_id_format_for_0307015 + // Error + //

incorrect id format for 0307015 + // + if (entries.size() == 1) { + Node node = entries.get(0); + Optional id = XMLUtil.getNodeContent(node, "id"); + Boolean isError = id.map(idContent -> idContent.startsWith("http://arxiv.org/api/errors")).orElse(false); + if (isError) { + String errorMessage = XMLUtil.getNodeContent(node, "summary").orElse("Unknown error"); + return new FetcherException(errorMessage); + } + } + return new FetcherException("arXiv API request failed"); + } + + @Override + public String getName() { + return "ArXiv"; + } + + @Override + public Optional getHelpPage() { + return Optional.of(HelpFile.FETCHER_OAI2_ARXIV); + } + + /** + * Constructs a complex query string using the field prefixes specified at https://arxiv.org/help/api/user-manual + * + * @param luceneQuery the root node of the lucene query + * @return A list of entries matching the complex query + */ + @Override + public Page performSearchPaged(QueryNode luceneQuery, int pageNumber) throws FetcherException { + ArXivQueryTransformer transformer = new ArXivQueryTransformer(); + String transformedQuery = transformer.transformLuceneQuery(luceneQuery).orElse(""); + List searchResult = searchForEntries(transformedQuery, pageNumber).stream() + .map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())) + .collect(Collectors.toList()); + return new Page<>(transformedQuery, pageNumber, filterYears(searchResult, transformer)); + } + + private List filterYears(List searchResult, ArXivQueryTransformer transformer) { + return searchResult.stream() + .filter(entry -> entry.getField(StandardField.DATE).isPresent()) + // Filter the date field for year only + .filter(entry -> transformer.getEndYear().isEmpty() || Integer.parseInt(entry.getField(StandardField.DATE).get().substring(0, 4)) <= transformer.getEndYear().get()) + .filter(entry -> transformer.getStartYear().isEmpty() || Integer.parseInt(entry.getField(StandardField.DATE).get().substring(0, 4)) >= transformer.getStartYear().get()) + .collect(Collectors.toList()); + } + + protected CompletableFuture> asyncPerformSearchById(String identifier) throws CompletionException { + return CompletableFuture.supplyAsync(() -> { + try { + return performSearchById(identifier); + } catch (FetcherException e) { + throw new CompletionException(e); + } + }); + } + + @Override + public Optional performSearchById(String identifier) throws FetcherException { + return searchForEntryById(identifier) + .map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())); + } + + @Override + public Optional findIdentifier(BibEntry entry) throws FetcherException { + return searchForEntries(entry).stream() + .map(ArXivEntry::getId) + .filter(Optional::isPresent) + .map(Optional::get) + .findFirst(); + } + + @Override + public String getIdentifierName() { + return "ArXiv"; + } + + private static class ArXivEntry { + + private final Optional title; + private final Optional urlAbstractPage; + private final Optional publishedDate; + private final Optional abstractText; + private final List authorNames; + private final List categories; + private final Optional pdfUrl; + private final Optional doi; + private final Optional journalReferenceText; + private final Optional primaryCategory; + + public ArXivEntry(Node item) { + // see https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned + + // Title of the article + // The result from the arXiv contains hard line breaks, try to remove them + title = XMLUtil.getNodeContent(item, 
"title").map(ArXivEntry::correctLineBreaks); + + // The url leading to the abstract page + urlAbstractPage = XMLUtil.getNodeContent(item, "id"); + + // Date on which the first version was published + publishedDate = XMLUtil.getNodeContent(item, "published"); + + // Abstract of the article + abstractText = XMLUtil.getNodeContent(item, "summary").map(ArXivEntry::correctLineBreaks) + .map(String::trim); + + // Authors of the article + authorNames = new ArrayList<>(); + for (Node authorNode : XMLUtil.getNodesByName(item, "author")) { + Optional authorName = XMLUtil.getNodeContent(authorNode, "name").map(String::trim); + authorName.ifPresent(authorNames::add); + } + + // Categories (arXiv, ACM, or MSC classification) + categories = new ArrayList<>(); + for (Node categoryNode : XMLUtil.getNodesByName(item, "category")) { + Optional category = XMLUtil.getAttributeContent(categoryNode, "term"); + category.ifPresent(categories::add); + } + + // Links + Optional pdfUrlParsed = Optional.empty(); + for (Node linkNode : XMLUtil.getNodesByName(item, "link")) { + Optional linkTitle = XMLUtil.getAttributeContent(linkNode, "title"); + if (linkTitle.equals(Optional.of("pdf"))) { + pdfUrlParsed = XMLUtil.getAttributeContent(linkNode, "href").map(url -> { + try { + return new URL(url); + } catch (MalformedURLException e) { + return null; + } + }); + } + } + pdfUrl = pdfUrlParsed; + + // Associated DOI + doi = XMLUtil.getNodeContent(item, "arxiv:doi"); + + // Journal reference (as provided by the author) + journalReferenceText = XMLUtil.getNodeContent(item, "arxiv:journal_ref"); + + // Primary category + // Ex: + primaryCategory = XMLUtil.getNode(item, "arxiv:primary_category") + .flatMap(node -> XMLUtil.getAttributeContent(node, "term")); + } + + public static String correctLineBreaks(String s) { + String result = s.replaceAll("\\n(?!\\s*\\n)", " "); + result = result.replaceAll("\\s*\\n\\s*", "\n"); + return result.replaceAll(" {2,}", " ").replaceAll("(^\\s*|\\s+$)", ""); + } + + /** + * Returns the url of the linked pdf + */ + public Optional getPdfUrl() { + return pdfUrl; + } + + /** + * Returns the arXiv identifier + */ + public Optional getIdString() { + return urlAbstractPage.flatMap(ArXivIdentifier::parse).map(ArXivIdentifier::getNormalizedWithoutVersion); + } + + public Optional getId() { + return getIdString().flatMap(ArXivIdentifier::parse); + } + + /** + * Returns the date when the first version was put on the arXiv + */ + public Optional getDate() { + // Publication string also contains time, e.g. 
2014-05-09T14:49:43Z + return publishedDate.map(date -> { + if (date.length() < 10) { + return null; + } else { + return date.substring(0, 10); + } + }); + } + + public BibEntry toBibEntry(Character keywordDelimiter) { + BibEntry bibEntry = new BibEntry(StandardEntryType.Article); + bibEntry.setField(StandardField.EPRINTTYPE, "arXiv"); + bibEntry.setField(StandardField.AUTHOR, String.join(" and ", authorNames)); + bibEntry.addKeywords(categories, keywordDelimiter); + getIdString().ifPresent(id -> bibEntry.setField(StandardField.EPRINT, id)); + title.ifPresent(titleContent -> bibEntry.setField(StandardField.TITLE, titleContent)); + doi.ifPresent(doiContent -> bibEntry.setField(StandardField.DOI, doiContent)); + abstractText.ifPresent(abstractContent -> bibEntry.setField(StandardField.ABSTRACT, abstractContent)); + getDate().ifPresent(date -> bibEntry.setField(StandardField.DATE, date)); + primaryCategory.ifPresent(category -> bibEntry.setField(StandardField.EPRINTCLASS, category)); + journalReferenceText.ifPresent(journal -> bibEntry.setField(StandardField.JOURNAL, journal)); + getPdfUrl().ifPresent(url -> bibEntry.setFiles(Collections.singletonList(new LinkedFile(url, "PDF")))); + return bibEntry; + } + } + } +} diff --git a/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java index 4f0927ef468..41bedef4c2e 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java @@ -1,10 +1,14 @@ package org.jabref.logic.importer.fetcher; import java.io.IOException; +import java.net.HttpURLConnection; import java.net.URL; +import java.net.URLConnection; import java.util.Collections; import java.util.List; import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; import java.util.regex.Pattern; import org.jabref.logic.cleanup.FieldFormatterCleanup; @@ -27,6 +31,7 @@ import org.jabref.model.util.DummyFileUpdateMonitor; import org.jabref.model.util.OptionalUtil; +import com.google.common.util.concurrent.RateLimiter; import kong.unirest.json.JSONArray; import kong.unirest.json.JSONException; import kong.unirest.json.JSONObject; @@ -43,6 +48,18 @@ public class DoiFetcher implements IdBasedFetcher, EntryBasedFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(DoiFetcher.class); + // 1000 request per 5 minutes. See https://support.datacite.org/docs/is-there-a-rate-limit-for-making-requests-against-the-datacite-apis + private static final RateLimiter DATA_CITE_DCN_RATE_LIMITER = RateLimiter.create(3.33); + + /* + * By default, it seems that CrossRef DOI Content Negotiation responses are returned by their API pools, more specifically the public one + * (by default). See https://www.crossref.org/documentation/retrieve-metadata/content-negotiation/ + * Experimentally, the rating applied to this pool is defined by response headers "X-Rate-Limit-Interval" and "X-Rate-Limit-Limit", which seems + * to default to 50 request / second. 
However, because of its dynamic nature, this rate could change between API calls, so we need to update it + * atomically when that happens (as multiple threads might access it at the same time) + */ + private static final RateLimiter CROSSREF_DCN_RATE_LIMITER = RateLimiter.create(50.0); + private final ImportFormatPreferences preferences; public DoiFetcher(ImportFormatPreferences preferences) { @@ -59,6 +76,40 @@ public Optional getHelpPage() { return Optional.of(HelpFile.FETCHER_DOI); } + private void doAPILimiting(String identifier) { + // Without a generic API Rate Limiter implemented on the project, use Guava's RateLimiter for avoiding + // API throttling when multiple threads are working, specially during DOI Content Negotiations + Optional doi = DOI.parse(identifier); + + try { + Optional agency; + if (doi.isPresent() && (agency = getAgency(doi.get())).isPresent()) { + double waitingTime = 0.0; + if (agency.get().equalsIgnoreCase("datacite")) { + waitingTime = DATA_CITE_DCN_RATE_LIMITER.acquire(); + } else if (agency.get().equalsIgnoreCase("crossref")) { + waitingTime = CROSSREF_DCN_RATE_LIMITER.acquire(); + } // mEDRA does not explicit an API rating + + LOGGER.trace(String.format("Thread %s, searching for DOI '%s', waited %.2fs because of API rate limiter", + Thread.currentThread().getId(), identifier, waitingTime)); + } + } catch (IOException e) { + LOGGER.warn("Could not limit DOI API access rate", e); + } + } + + protected CompletableFuture> asyncPerformSearchById(String identifier) { + doAPILimiting(identifier); + return CompletableFuture.supplyAsync(() -> { + try { + return performSearchById(identifier); + } catch (FetcherException e) { + throw new CompletionException(e); + } + }); + } + @Override public Optional performSearchById(String identifier) throws FetcherException { Optional doi = DOI.parse(identifier); @@ -68,16 +119,21 @@ public Optional performSearchById(String identifier) throws FetcherExc Optional fetchedEntry; // mEDRA does not return a parsable bibtex string - if (getAgency(doi.get()).isPresent() && "medra".equalsIgnoreCase(getAgency(doi.get()).get())) { + Optional agency = getAgency(doi.get()); + if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) { return new Medra().performSearchById(identifier); } URL doiURL = new URL(doi.get().getURIAsASCIIString()); + // BibTeX data URLDownload download = getUrlDownload(doiURL); download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX); + String bibtexString; + URLConnection openConnection; try { - bibtexString = download.asString(); + openConnection = download.openConnection(); + bibtexString = URLDownload.asString(openConnection); } catch (IOException e) { // an IOException with a nested FetcherException will be thrown when you encounter a 400x or 500x http status code if (e.getCause() instanceof FetcherException fe) { @@ -90,6 +146,11 @@ public Optional performSearchById(String identifier) throws FetcherExc fetchedEntry = BibtexParser.singleFromString(bibtexString, preferences, new DummyFileUpdateMonitor()); fetchedEntry.ifPresent(this::doPostCleanup); + // Crossref has a dynamic API rate limit + if (agency.isPresent() && agency.get().equalsIgnoreCase("crossref")) { + updateCrossrefAPIRate(openConnection); + } + // Check if the entry is an APS journal and add the article id as the page count if page field is missing if (fetchedEntry.isPresent() && fetchedEntry.get().hasField(StandardField.DOI)) { BibEntry entry = fetchedEntry.get(); @@ -98,6 +159,9 @@ public Optional performSearchById(String identifier) 
throws FetcherExc } } + if (openConnection instanceof HttpURLConnection) { + ((HttpURLConnection) openConnection).disconnect(); + } return fetchedEntry; } else { throw new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier)); @@ -116,6 +180,25 @@ private void doPostCleanup(BibEntry entry) { new FieldFormatterCleanup(StandardField.URL, new ClearFormatter()).cleanup(entry); } + private void updateCrossrefAPIRate(URLConnection existingConnection) { + try { + // Assuming this field is given in seconds + String xRateLimitInterval = existingConnection.getHeaderField("X-Rate-Limit-Interval").replaceAll("[^\\.0123456789]", ""); + String xRateLimit = existingConnection.getHeaderField("X-Rate-Limit-Limit"); + + double newRate = Double.parseDouble(xRateLimit) / Double.parseDouble(xRateLimitInterval); + double oldRate = CROSSREF_DCN_RATE_LIMITER.getRate(); + + // In theory, the actual update might rarely happen... + if (Math.abs(newRate - oldRate) >= 1.0) { + LOGGER.info(String.format("Updated Crossref API rate limit from %.2f to %.2f", oldRate, newRate)); + CROSSREF_DCN_RATE_LIMITER.setRate(newRate); + } + } catch (NullPointerException | IllegalArgumentException e) { + LOGGER.warn("Could not deduce Crossref API's rate limit from response header. API might have changed"); + } + } + @Override public List performSearch(BibEntry entry) throws FetcherException { Optional doi = entry.getField(StandardField.DOI); @@ -140,7 +223,7 @@ public Optional getAgency(DOI doi) throws IOException { agency = Optional.ofNullable(response.optString("RA")); } } catch (JSONException e) { - LOGGER.error("Cannot parse agency fetcher repsonse to JSON"); + LOGGER.error("Cannot parse agency fetcher response to JSON"); return Optional.empty(); } diff --git a/src/main/java/org/jabref/logic/net/URLDownload.java b/src/main/java/org/jabref/logic/net/URLDownload.java index c499dcf3635..d17682b8201 100644 --- a/src/main/java/org/jabref/logic/net/URLDownload.java +++ b/src/main/java/org/jabref/logic/net/URLDownload.java @@ -60,8 +60,9 @@ * dl.toFile(Path); // available in FILE * String contentType = dl.getMimeType(); * - * - * Each call to a public method creates a new HTTP connection. Nothing is cached. + *
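The DoiFetcher changes above combine two ideas: a shared Guava RateLimiter that throttles DOI content negotiation across threads, and a CompletableFuture wrapper so several lookups can run in parallel without blocking the caller on the network request. The following is a minimal, self-contained sketch of that pattern, assuming Guava is on the classpath; the class and method names (RateLimitedLookup, lookup) and the example DOI are illustrative only and are not JabRef API.

    import java.util.Optional;
    import java.util.concurrent.CompletableFuture;
    import java.util.concurrent.CompletionException;

    import com.google.common.util.concurrent.RateLimiter;

    public class RateLimitedLookup {

        // One shared, thread-safe limiter: concurrent acquire() calls are throttled
        // to at most 50 permits per second, mirroring CROSSREF_DCN_RATE_LIMITER above.
        private static final RateLimiter LIMITER = RateLimiter.create(50.0);

        public CompletableFuture<Optional<String>> asyncLookup(String doi) {
            // acquire() blocks until a permit is free and returns the seconds spent waiting,
            // which is what the LOGGER.trace call in the patch records.
            double waited = LIMITER.acquire();
            System.out.printf("waited %.2fs before querying %s%n", waited, doi);
            return CompletableFuture.supplyAsync(() -> {
                try {
                    return lookup(doi);
                } catch (Exception e) {
                    // CompletableFuture cannot propagate checked exceptions directly
                    throw new CompletionException(e);
                }
            });
        }

        private Optional<String> lookup(String doi) throws Exception {
            return Optional.of("@article{...}"); // stand-in for the real content negotiation
        }

        public static void main(String[] args) {
            new RateLimitedLookup().asyncLookup("10.1000/example").join();
        }
    }

As in the patch, the permit is acquired on the calling thread before the asynchronous work is scheduled, so the rate limit also applies to the very first request.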

+ * Almost each call to a public method creates a new HTTP connection (except for {@link #asString(Charset, URLConnection) asString}, + * which uses an already opened connection). Nothing is cached. */ public class URLDownload { @@ -231,6 +232,15 @@ public void setPostData(String postData) { } } + /** + * Downloads the web resource to a String. Uses UTF-8 as encoding. + * + * @return the downloaded string + */ + public String asString() throws IOException { + return asString(StandardCharsets.UTF_8, this.openConnection()); + } + /** * Downloads the web resource to a String. * @@ -238,20 +248,33 @@ public void setPostData(String postData) { * @return the downloaded string */ public String asString(Charset encoding) throws IOException { - try (InputStream input = new BufferedInputStream(this.openConnection().getInputStream()); - Writer output = new StringWriter()) { - copy(input, output, encoding); - return output.toString(); - } + return asString(encoding, this.openConnection()); } /** - * Downloads the web resource to a String. Uses UTF-8 as encoding. + * Downloads the web resource to a String from an existing connection. Uses UTF-8 as encoding. * + * @param existingConnection an existing connection * @return the downloaded string */ - public String asString() throws IOException { - return asString(StandardCharsets.UTF_8); + public static String asString(URLConnection existingConnection) throws IOException { + return asString(StandardCharsets.UTF_8, existingConnection); + } + + /** + * Downloads the web resource to a String. + * + * @param encoding the desired String encoding + * @param connection an existing connection + * @return the downloaded string + */ + public static String asString(Charset encoding, URLConnection connection) throws IOException { + + try (InputStream input = new BufferedInputStream(connection.getInputStream()); + Writer output = new StringWriter()) { + copy(input, output, encoding); + return output.toString(); + } } public List getCookieFromUrl() throws IOException { @@ -325,7 +348,7 @@ public String toString() { return "URLDownload{" + "source=" + this.source + '}'; } - private void copy(InputStream in, Writer out, Charset encoding) throws IOException { + private static void copy(InputStream in, Writer out, Charset encoding) throws IOException { Reader r = new InputStreamReader(in, encoding); try (BufferedReader read = new BufferedReader(r)) { String line; @@ -336,7 +359,13 @@ private void copy(InputStream in, Writer out, Charset encoding) throws IOExcepti } } - private URLConnection openConnection() throws IOException { + /** + * Open a connection to this object's URL (with specified settings). If accessing an HTTP URL, don't forget + * to close the resulting connection after usage. 
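The new asString overloads above exist so a caller can hand in an already opened connection, download the body through it, and still read the response headers afterwards; DoiFetcher relies on this in updateCrossrefAPIRate to pick up Crossref's X-Rate-Limit-Limit and X-Rate-Limit-Interval headers. Below is a rough standalone illustration of that connection-reuse idea using plain java.net classes rather than JabRef's URLDownload; the URL is a placeholder, not something the patch calls.

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.net.URLConnection;
    import java.nio.charset.StandardCharsets;
    import java.util.stream.Collectors;

    public class HeaderAwareDownload {
        public static void main(String[] args) throws IOException {
            URLConnection connection = new URL("https://example.org/some-resource").openConnection();

            // Read the body through the SAME connection instead of opening a new one per call
            String body;
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                body = reader.lines().collect(Collectors.joining("\n"));
            }

            // Because the connection object is still at hand, its response headers remain readable
            String limit = connection.getHeaderField("X-Rate-Limit-Limit");
            String interval = connection.getHeaderField("X-Rate-Limit-Interval");
            System.out.println(body.length() + " characters, rate limit header: " + limit + " per " + interval);

            if (connection instanceof HttpURLConnection http) {
                http.disconnect(); // matches the explicit disconnect added to DoiFetcher
            }
        }
    }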
+ * + * @return an open connection + */ + public URLConnection openConnection() throws IOException { URLConnection connection = this.source.openConnection(); connection.setConnectTimeout((int) connectTimeout.toMillis()); for (Entry entry : this.parameters.entrySet()) { diff --git a/src/main/java/org/jabref/model/entry/BibEntry.java b/src/main/java/org/jabref/model/entry/BibEntry.java index a7b39696564..38e37c40db7 100644 --- a/src/main/java/org/jabref/model/entry/BibEntry.java +++ b/src/main/java/org/jabref/model/entry/BibEntry.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -10,8 +11,10 @@ import java.util.Objects; import java.util.Optional; import java.util.Set; +import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.function.BiFunction; +import java.util.stream.Collectors; import javafx.beans.Observable; import javafx.beans.property.ObjectProperty; @@ -139,11 +142,11 @@ public Optional getResolvedFieldOrAlias(OrFields fields, BibDatabase dat private Optional getSourceField(Field targetField, EntryType targetEntry, EntryType sourceEntry) { //// 1. Sort out forbidden fields if ((targetField == StandardField.IDS) || - (targetField == StandardField.CROSSREF) || - (targetField == StandardField.XREF) || - (targetField == StandardField.ENTRYSET) || - (targetField == StandardField.RELATED) || - (targetField == StandardField.SORTKEY)) { + (targetField == StandardField.CROSSREF) || + (targetField == StandardField.XREF) || + (targetField == StandardField.ENTRYSET) || + (targetField == StandardField.RELATED) || + (targetField == StandardField.SORTKEY)) { return Optional.empty(); } @@ -185,8 +188,8 @@ private Optional getSourceField(Field targetField, EntryType targetEntry, // those fields are no more available for the same-name inheritance strategy if ((targetField == StandardField.TITLE) || - (targetField == StandardField.SUBTITLE) || - (targetField == StandardField.TITLEADDON)) { + (targetField == StandardField.SUBTITLE) || + (targetField == StandardField.TITLEADDON)) { return Optional.empty(); } @@ -197,12 +200,12 @@ private Optional getSourceField(Field targetField, EntryType targetEntry, } if (((sourceEntry == StandardEntryType.Book) && (targetEntry == StandardEntryType.InBook)) || - ((sourceEntry == StandardEntryType.Book) && (targetEntry == StandardEntryType.BookInBook)) || - ((sourceEntry == StandardEntryType.Book) && (targetEntry == StandardEntryType.SuppBook)) || - ((sourceEntry == StandardEntryType.Collection) && (targetEntry == StandardEntryType.InCollection)) || - ((sourceEntry == StandardEntryType.Collection) && (targetEntry == StandardEntryType.SuppCollection)) || - ((sourceEntry == StandardEntryType.Reference) && (targetEntry == StandardEntryType.InReference)) || - ((sourceEntry == StandardEntryType.Proceedings) && (targetEntry == StandardEntryType.InProceedings))) { + ((sourceEntry == StandardEntryType.Book) && (targetEntry == StandardEntryType.BookInBook)) || + ((sourceEntry == StandardEntryType.Book) && (targetEntry == StandardEntryType.SuppBook)) || + ((sourceEntry == StandardEntryType.Collection) && (targetEntry == StandardEntryType.InCollection)) || + ((sourceEntry == StandardEntryType.Collection) && (targetEntry == StandardEntryType.SuppCollection)) || + ((sourceEntry == StandardEntryType.Reference) && (targetEntry == StandardEntryType.InReference)) || + ((sourceEntry == 
StandardEntryType.Proceedings) && (targetEntry == StandardEntryType.InProceedings))) { if (targetField == StandardField.BOOKTITLE) { return Optional.of(StandardField.TITLE); } @@ -215,8 +218,8 @@ private Optional getSourceField(Field targetField, EntryType targetEntry, // those fields are no more available for the same-name inheritance strategy if ((targetField == StandardField.TITLE) || - (targetField == StandardField.SUBTITLE) || - (targetField == StandardField.TITLEADDON)) { + (targetField == StandardField.SUBTITLE) || + (targetField == StandardField.TITLEADDON)) { return Optional.empty(); } @@ -227,7 +230,7 @@ private Optional getSourceField(Field targetField, EntryType targetEntry, } if (((sourceEntry == IEEETranEntryType.Periodical) && (targetEntry == StandardEntryType.Article)) || - ((sourceEntry == IEEETranEntryType.Periodical) && (targetEntry == StandardEntryType.SuppPeriodical))) { + ((sourceEntry == IEEETranEntryType.Periodical) && (targetEntry == StandardEntryType.SuppPeriodical))) { if (targetField == StandardField.JOURNALTITLE) { return Optional.of(StandardField.TITLE); } @@ -237,7 +240,7 @@ private Optional getSourceField(Field targetField, EntryType targetEntry, // those fields are no more available for the same-name inheritance strategy if ((targetField == StandardField.TITLE) || - (targetField == StandardField.SUBTITLE)) { + (targetField == StandardField.SUBTITLE)) { return Optional.empty(); } @@ -544,6 +547,7 @@ public void setField(Map fields) { public Optional setField(Field field, String value, EntriesEventSource eventSource) { Objects.requireNonNull(field, "field name must not be null"); Objects.requireNonNull(value, "field value must not be null"); + Objects.requireNonNull(eventSource, "field eventSource must not be null"); if (value.isEmpty()) { return clearField(field); @@ -1044,4 +1048,42 @@ public void replaceDownloadedFile(String linkToDownloadedFile, LinkedFile downlo this.setFiles(linkedFiles); } + + /** + * Merge this entry's fields with another BibEntry. Non-intersecting fields will be automatically merged. In cases of + * intersection, priority is given to THIS entry's field value. + * + * @param other another BibEntry from which fields are sourced from + */ + public void mergeWith(BibEntry other) { + mergeWith(other, Set.of()); + } + + /** + * Merge this entry's fields with another BibEntry. Non-intersecting fields will be automatically merged. In cases of + * intersection, priority is given to THIS entry's field value, UNLESS specified otherwise in the arguments. + * + * @param other another BibEntry from which fields are sourced from + * @param otherPrioritizedFields collection of Fields in which 'other' has a priority into final result + */ + public void mergeWith(BibEntry other, Set otherPrioritizedFields) { + Set thisFields = new TreeSet<>(Comparator.comparing(Field::getName)); + Set otherFields = new TreeSet<>(Comparator.comparing(Field::getName)); + + thisFields.addAll(this.getFields()); + otherFields.addAll(other.getFields()); + + // At the moment, "Field" interface does not provide explicit equality, so using their names instead. 
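Because the merge semantics are easiest to see with concrete values, here is a short usage sketch of the mergeWith helper whose implementation continues just below. The entry contents are invented, and the prioritized-fields parameter is assumed to be a Set<Field>, as suggested by the javadoc above.

    import java.util.Set;

    import org.jabref.model.entry.BibEntry;
    import org.jabref.model.entry.field.StandardField;
    import org.jabref.model.entry.types.StandardEntryType;

    public class MergeExample {
        public static void main(String[] args) {
            BibEntry arxivEntry = new BibEntry(StandardEntryType.Article)
                    .withField(StandardField.TITLE, "Preprint title")
                    .withField(StandardField.EPRINT, "1234.56789");

            BibEntry doiEntry = new BibEntry(StandardEntryType.Article)
                    .withField(StandardField.TITLE, "Published title")
                    .withField(StandardField.YEAR, "2022");

            // YEAR only exists in doiEntry, so it is copied over; TITLE exists in both,
            // so arxivEntry's own value wins by default.
            arxivEntry.mergeWith(doiEntry);
            System.out.println(arxivEntry.getField(StandardField.TITLE)); // Optional[Preprint title]

            // Listing TITLE as prioritized lets the other entry overwrite it instead.
            arxivEntry.mergeWith(doiEntry, Set.of(StandardField.TITLE));
            System.out.println(arxivEntry.getField(StandardField.TITLE)); // Optional[Published title]
        }
    }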
+ Set thisFieldsNames = thisFields.stream().map(Field::getName).collect(Collectors.toSet()); + Set otherPrioritizedFieldsNames = otherPrioritizedFields.stream().map(Field::getName).collect(Collectors.toSet()); + + for (Field otherField : otherFields) { + Optional otherFieldValue = other.getField(otherField); + if (!thisFieldsNames.contains(otherField.getName()) || + otherPrioritizedFieldsNames.contains(otherField.getName())) { + // As iterator only goes through non-null fields from OTHER, otherFieldValue can never be empty + otherFieldValue.ifPresent(s -> this.setField(otherField, s)); + } + } + } } diff --git a/src/test/java/org/jabref/logic/importer/WebFetchersTest.java b/src/test/java/org/jabref/logic/importer/WebFetchersTest.java index 95c9c32e642..1550416fad8 100644 --- a/src/test/java/org/jabref/logic/importer/WebFetchersTest.java +++ b/src/test/java/org/jabref/logic/importer/WebFetchersTest.java @@ -2,6 +2,7 @@ import java.util.Collection; import java.util.HashSet; +import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; @@ -23,6 +24,8 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Answers; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.mock; @@ -30,6 +33,9 @@ class WebFetchersTest { + private static final Logger LOGGER = LoggerFactory.getLogger(WebFetchersTest.class); + private static final Set IGNORED_INACCESSIBLE_FETCHERS = Set.of("ArXivFetcher$ArXiv"); + private ImportFormatPreferences importFormatPreferences; private ImporterPreferences importerPreferences; private final ClassGraph classGraph = new ClassGraph().enableAllInfo().acceptPackages("org.jabref"); @@ -42,6 +48,19 @@ void setUp() { when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn(fieldContentFormatterPreferences); } + private Set> getIgnoredInaccessibleClasses() { + return IGNORED_INACCESSIBLE_FETCHERS.stream() + .map(className -> "org.jabref.logic.importer.fetcher." 
+ className) + .map(classPath -> { + try { + return Class.forName(classPath); + } catch (ClassNotFoundException e) { + LOGGER.error("Some of the ignored classes were not found {}", e); + return null; + } + }).filter(Objects::nonNull).collect(Collectors.toSet()); + } + @Test void getIdBasedFetchersReturnsAllFetcherDerivingFromIdBasedFetcher() { Set idFetchers = WebFetchers.getIdBasedFetchers(importFormatPreferences, importerPreferences); @@ -50,6 +69,9 @@ void getIdBasedFetchersReturnsAllFetcherDerivingFromIdBasedFetcher() { ClassInfoList controlClasses = scanResult.getClassesImplementing(IdBasedFetcher.class.getCanonicalName()); Set> expected = new HashSet<>(controlClasses.loadClasses()); + // Some classes implement IdBasedFetcher, but are only accessible to other fetcher, so ignore them + expected.removeAll(getIgnoredInaccessibleClasses()); + expected.remove(AbstractIsbnFetcher.class); expected.remove(IdBasedParserFetcher.class); @@ -91,6 +113,9 @@ void getSearchBasedFetchersReturnsAllFetcherDerivingFromSearchBasedFetcher() { ClassInfoList controlClasses = scanResult.getClassesImplementing(SearchBasedFetcher.class.getCanonicalName()); Set> expected = new HashSet<>(controlClasses.loadClasses()); + // Some classes implement SearchBasedFetcher, but are only accessible to other fetcher, so ignore them + expected.removeAll(getIgnoredInaccessibleClasses()); + // Remove interfaces expected.remove(SearchBasedParserFetcher.class); @@ -116,6 +141,9 @@ void getFullTextFetchersReturnsAllFetcherDerivingFromFullTextFetcher() { ClassInfoList controlClasses = scanResult.getClassesImplementing(FulltextFetcher.class.getCanonicalName()); Set> expected = new HashSet<>(controlClasses.loadClasses()); + // Some classes implement FulltextFetcher, but are only accessible to other fetcher, so ignore them + expected.removeAll(getIgnoredInaccessibleClasses()); + // Remove the following, because they don't work atm expected.remove(JstorFetcher.class); expected.remove(GoogleScholar.class); @@ -132,6 +160,9 @@ void getIdFetchersReturnsAllFetcherDerivingFromIdFetcher() { ClassInfoList controlClasses = scanResult.getClassesImplementing(IdFetcher.class.getCanonicalName()); Set> expected = new HashSet<>(controlClasses.loadClasses()); + // Some classes implement IdFetcher, but are only accessible to other fetcher, so ignore them + expected.removeAll(getIgnoredInaccessibleClasses()); + expected.remove(IdParserFetcher.class); // Remove the following, because they don't work at the moment expected.remove(GoogleScholar.class); diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ArXivFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ArXivFetcherTest.java new file mode 100644 index 00000000000..d7173fc1027 --- /dev/null +++ b/src/test/java/org/jabref/logic/importer/fetcher/ArXivFetcherTest.java @@ -0,0 +1,588 @@ +package org.jabref.logic.importer.fetcher; + +import java.io.IOException; +import java.net.URL; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.StringJoiner; +import java.util.stream.Collectors; + +import org.jabref.logic.bibtex.FieldContentFormatterPreferences; +import org.jabref.logic.importer.FetcherException; +import org.jabref.logic.importer.ImportCleanup; +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.PagedSearchBasedFetcher; +import org.jabref.logic.importer.SearchBasedFetcher; +import org.jabref.model.database.BibDatabaseMode; +import 
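The WebFetchersTest changes above use ClassGraph to enumerate every class on the classpath that implements a fetcher interface and then subtract nested fetchers (such as ArXivFetcher$ArXiv) that are intentionally not exposed through WebFetchers. A rough sketch of that scanning idea follows, assuming ClassGraph is available and that IdBasedFetcher lives in org.jabref.logic.importer; the class name FindFetcherImplementations is illustrative.

    import java.util.Set;
    import java.util.stream.Collectors;

    import io.github.classgraph.ClassGraph;
    import io.github.classgraph.ScanResult;

    public class FindFetcherImplementations {
        public static void main(String[] args) {
            try (ScanResult scan = new ClassGraph()
                    .enableAllInfo()
                    .acceptPackages("org.jabref")
                    .scan()) {
                // All implementers of the interface, minus nested helpers that other
                // fetchers use internally and that WebFetchers never hands out.
                Set<String> names = scan.getClassesImplementing("org.jabref.logic.importer.IdBasedFetcher")
                                        .getNames()
                                        .stream()
                                        .filter(name -> !name.endsWith("ArXivFetcher$ArXiv"))
                                        .collect(Collectors.toSet());
                names.forEach(System.out::println);
            }
        }
    }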
org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.InternalField; +import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.field.UnknownField; +import org.jabref.model.entry.identifier.ArXivIdentifier; +import org.jabref.model.entry.types.StandardEntryType; +import org.jabref.testutils.category.FetcherTest; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +@FetcherTest +class ArXivFetcherTest implements SearchBasedFetcherCapabilityTest, PagedSearchFetcherTest { + private static ImportFormatPreferences importFormatPreferences; + + private ArXivFetcher fetcher; + private BibEntry entry; + private BibEntry sliceTheoremPaper; + + private BibEntry mainOriginalPaper; + private BibEntry mainResultPaper; + + private BibEntry completePaper; + + @BeforeAll + static void setUp() { + importFormatPreferences = mock(ImportFormatPreferences.class); + when(importFormatPreferences.getKeywordSeparator()).thenReturn(','); + // Used during DOI fetch process + when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn( + new FieldContentFormatterPreferences( + Arrays.stream("pdf;ps;url;doi;file;isbn;issn".split(";")) + .map(fieldName -> StandardField.fromName(fieldName).isPresent() ? StandardField.fromName(fieldName).get() : new UnknownField(fieldName)) + .collect(Collectors.toList()))); + } + + @BeforeEach + void eachSetUp() { + fetcher = new ArXivFetcher(importFormatPreferences); + entry = new BibEntry(); + + // A BibEntry with information only from ArXiv API + mainOriginalPaper = new BibEntry(StandardEntryType.Article) + // ArXiv-original fields + .withField(StandardField.AUTHOR, "Joeran Beel and Andrew Collins and Akiko Aizawa") + .withField(StandardField.TITLE, "The Architecture of Mr. DLib's Scientific Recommender-System API") + .withField(StandardField.DATE, "2018-11-26") + .withField(StandardField.ABSTRACT, "Recommender systems in academia are not widely available. This may be in part due to the difficulty and cost of developing and maintaining recommender systems. Many operators of academic products such as digital libraries and reference managers avoid this effort, although a recommender system could provide significant benefits to their users. In this paper, we introduce Mr. DLib's \"Recommendations as-a-Service\" (RaaS) API that allows operators of academic products to easily integrate a scientific recommender system into their products. Mr. DLib generates recommendations for research articles but in the future, recommendations may include call for papers, grants, etc. Operators of academic products can request recommendations from Mr. DLib and display these recommendations to their users. Mr. DLib can be integrated in just a few hours or days; creating an equivalent recommender system from scratch would require several months for an academic operator. Mr. DLib has been used by GESIS Sowiport and by the reference manager JabRef. Mr. 
DLib is open source and its goal is to facilitate the application of, and research on, scientific recommender systems. In this paper, we present the motivation for Mr. DLib, the architecture and details about the effectiveness. Mr. DLib has delivered 94m recommendations over a span of two years with an average click-through rate of 0.12%.") + .withField(StandardField.EPRINT, "1811.10364") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1811.10364v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "cs.IR") + .withField(StandardField.KEYWORDS, "cs.IR, cs.AI, cs.DL, cs.LG"); + + mainResultPaper = new BibEntry(StandardEntryType.Article) + // ArXiv-original fields +// .withField(StandardField.AUTHOR, "Joeran Beel and Andrew Collins and Akiko Aizawa") + .withField(StandardField.TITLE, "The Architecture of Mr. DLib's Scientific Recommender-System API") + .withField(StandardField.DATE, "2018-11-26") + .withField(StandardField.ABSTRACT, "Recommender systems in academia are not widely available. This may be in part due to the difficulty and cost of developing and maintaining recommender systems. Many operators of academic products such as digital libraries and reference managers avoid this effort, although a recommender system could provide significant benefits to their users. In this paper, we introduce Mr. DLib's \"Recommendations as-a-Service\" (RaaS) API that allows operators of academic products to easily integrate a scientific recommender system into their products. Mr. DLib generates recommendations for research articles but in the future, recommendations may include call for papers, grants, etc. Operators of academic products can request recommendations from Mr. DLib and display these recommendations to their users. Mr. DLib can be integrated in just a few hours or days; creating an equivalent recommender system from scratch would require several months for an academic operator. Mr. DLib has been used by GESIS Sowiport and by the reference manager JabRef. Mr. DLib is open source and its goal is to facilitate the application of, and research on, scientific recommender systems. In this paper, we present the motivation for Mr. DLib, the architecture and details about the effectiveness. Mr. 
DLib has delivered 94m recommendations over a span of two years with an average click-through rate of 0.12%.") + .withField(StandardField.EPRINT, "1811.10364") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1811.10364v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "cs.IR") +// .withField(StandardField.KEYWORDS, "cs.IR, cs.AI, cs.DL, cs.LG") + // Unavailable info: + // StandardField.JOURNALTITLE // INFO NOT APPLICABLE TO THIS ENTRY + // ArXiv-issue DOI fields + .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license") + .withField((InternalField.KEY_FIELD), "https://doi.org/10.48550/arxiv.1811.10364") + .withField(StandardField.YEAR, "2018") + .withField(StandardField.KEYWORDS, "Information Retrieval (cs.IR), Artificial Intelligence (cs.AI), Digital Libraries (cs.DL), Machine Learning (cs.LG), FOS: Computer and information sciences") + .withField(StandardField.AUTHOR, "Beel, Joeran and Collins, Andrew and Aizawa, Akiko") + .withField(StandardField.PUBLISHER, "arXiv") + .withField(StandardField.DOI, "10.48550/ARXIV.1811.10364"); + + // Example of a robust result, with information from both ArXiv-assigned and user-assigned DOIs + completePaper = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Büscher, Tobias and Diez, Angel L. and Gompper, Gerhard and Elgeti, Jens") + .withField(StandardField.TITLE, "Instability and fingering of interfaces in growing tissue") + .withField(StandardField.DATE, "2020-03-10") + .withField(StandardField.YEAR, "2020") + .withField(StandardField.MONTH, "aug") + .withField(StandardField.NUMBER, "8") + .withField(StandardField.VOLUME, "22") + .withField(StandardField.PAGES, "083005") + .withField(StandardField.PUBLISHER, "{IOP} Publishing") + .withField(StandardField.JOURNAL, "New Journal of Physics") + .withField(StandardField.ABSTRACT, "Interfaces in tissues are ubiquitous, both between tissue and environment as well as between populations of different cell types. The propagation of an interface can be driven mechanically. % e.g. by a difference in the respective homeostatic stress of the different cell types. Computer simulations of growing tissues are employed to study the stability of the interface between two tissues on a substrate. From a mechanical perspective, the dynamics and stability of this system is controlled mainly by four parameters of the respective tissues: (i) the homeostatic stress (ii) cell motility (iii) tissue viscosity and (iv) substrate friction. For propagation driven by a difference in homeostatic stress, the interface is stable for tissue-specific substrate friction even for very large differences of homeostatic stress; however, it becomes unstable above a critical stress difference when the tissue with the larger homeostatic stress has a higher viscosity. A small difference in directed bulk motility between the two tissues suffices to result in propagation with a stable interface, even for otherwise identical tissues. Larger differences in motility force, however, result in a finite-wavelength instability of the interface. 
Interestingly, the instability is apparently bound by nonlinear effects and the amplitude of the interface undulations only grows to a finite value in time.") + .withField(StandardField.DOI, "10.1088/1367-2630/ab9e88") + .withField(StandardField.EPRINT, "2003.04601") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/2003.04601v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "q-bio.TO") + .withField(StandardField.KEYWORDS, "Tissues and Organs (q-bio.TO), FOS: Biological sciences") + .withField(InternalField.KEY_FIELD, "B_scher_2020") + .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license"); + + sliceTheoremPaper = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Diez, Tobias") + .withField(StandardField.TITLE, "Slice theorem for Fréchet group actions and covariant symplectic field theory") + .withField(StandardField.DATE, "2014-05-09") + .withField(StandardField.YEAR, "2014") + .withField(StandardField.PUBLISHER, "arXiv") + .withField(StandardField.ABSTRACT, "A general slice theorem for the action of a Fr\\'echet Lie group on a Fr\\'echet manifolds is established. The Nash-Moser theorem provides the fundamental tool to generalize the result of Palais to this infinite-dimensional setting. The presented slice theorem is illustrated by its application to gauge theories: the action of the gauge transformation group admits smooth slices at every point and thus the gauge orbit space is stratified by Fr\\'echet manifolds. Furthermore, a covariant and symplectic formulation of classical field theory is proposed and extensively discussed. At the root of this novel framework is the incorporation of field degrees of freedom F and spacetime M into the product manifold F * M. The induced bigrading of differential forms is used in order to carry over the usual symplectic theory to this new setting. 
The examples of the Klein-Gordon field and general Yang-Mills theory illustrate that the presented approach conveniently handles the occurring symmetries.") + .withField(StandardField.DOI, "10.48550/ARXIV.1405.2249") + .withField(StandardField.EPRINT, "1405.2249") + .withField(StandardField.EPRINTCLASS, "math-ph") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1405.2249v1:PDF") + .withField(StandardField.KEYWORDS, "Mathematical Physics (math-ph), Differential Geometry (math.DG), Symplectic Geometry (math.SG), FOS: Physical sciences, FOS: Mathematics, 58B99, 58Z05, 58B25, 22E65, 58D19, 53D20, 53D42") + .withField(InternalField.KEY_FIELD, "https://doi.org/10.48550/arxiv.1405.2249") + .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license"); + } + + @Override + public SearchBasedFetcher getFetcher() { + return fetcher; + } + + public List getInputTestAuthors() { + return Arrays.stream(mainOriginalPaper.getField(StandardField.AUTHOR).get() + .split("and")).map(String::trim).collect(Collectors.toList()); + } + + @Override + public List getTestAuthors() { + return Arrays.stream(mainResultPaper.getField(StandardField.AUTHOR).get() + .split("and")).map(String::trim).collect(Collectors.toList()); + } + + @Override + public String getTestJournal() { + return "Journal of Geometry and Physics (2013)"; + } + + @Override + public PagedSearchBasedFetcher getPagedFetcher() { + return fetcher; + } + + @Test + @Override + public void supportsAuthorSearch() throws FetcherException { + StringJoiner queryBuilder = new StringJoiner("\" AND author:\"", "author:\"", "\""); + getInputTestAuthors().forEach(queryBuilder::add); + + List result = getFetcher().performSearch(queryBuilder.toString()); + new ImportCleanup(BibDatabaseMode.BIBTEX).doPostCleanup(result); + + assertFalse(result.isEmpty()); + result.forEach(bibEntry -> { + String author = bibEntry.getField(StandardField.AUTHOR).orElse(""); + + // The co-authors differ, thus we check for the author present at all papers + getTestAuthors().forEach(expectedAuthor -> Assertions.assertTrue(author.contains(expectedAuthor.replace("\"", "")))); + }); + } + + @Test + public void noSupportsAuthorSearchWithLastFirstName() throws FetcherException { + StringJoiner queryBuilder = new StringJoiner("\" AND author:\"", "author:\"", "\""); + getTestAuthors().forEach(queryBuilder::add); + + List result = getFetcher().performSearch(queryBuilder.toString()); + new ImportCleanup(BibDatabaseMode.BIBTEX).doPostCleanup(result); + + assertTrue(result.isEmpty()); + } + + @Test + void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException { + assertEquals(Optional.empty(), fetcher.findFullText(entry)); + } + + @Test + void findFullTextRejectsNullParameter() { + assertThrows(NullPointerException.class, () -> fetcher.findFullText(null)); + } + + @Test + void findFullTextByDOI() throws IOException { + entry.setField(StandardField.DOI, "10.1529/biophysj.104.047340"); + entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByEprint() throws IOException { + entry.setField(StandardField.EPRINT, "1603.06570"); + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByEprintWithPrefix() throws IOException { + 
entry.setField(StandardField.EPRINT, "arXiv:1603.06570"); + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByEprintWithUnknownDOI() throws IOException { + entry.setField(StandardField.DOI, "10.1529/unknown"); + entry.setField(StandardField.EPRINT, "1603.06570"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByTitle() throws IOException { + entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByTitleWithCurlyBracket() throws IOException { + entry.setField(StandardField.TITLE, "Machine versus {Human} {Attention} in {Deep} {Reinforcement} {Learning} {Tasks}"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/2010.15942v3")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByTitleWithColonAndJournalWithoutEprint() throws IOException { + entry.setField(StandardField.TITLE, "Bayes-TrEx: a Bayesian Sampling Approach to Model Transparency by Example"); + entry.setField(StandardField.JOURNAL, "arXiv:2002.10248v4 [cs]"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/2002.10248v4")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByTitleWithColonAndUrlWithoutEprint() throws IOException { + entry.setField(StandardField.TITLE, "Bayes-TrEx: a Bayesian Sampling Approach to Model Transparency by Example"); + entry.setField(StandardField.URL, "http://arxiv.org/abs/2002.10248v4"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/2002.10248v4")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByTitleAndPartOfAuthor() throws IOException { + entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); + entry.setField(StandardField.AUTHOR, "Weeks and Lucks"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByTitleWithCurlyBracketAndPartOfAuthor() throws IOException { + entry.setField(StandardField.TITLE, "Machine versus {Human} {Attention} in {Deep} {Reinforcement} {Learning} {Tasks}"); + entry.setField(StandardField.AUTHOR, "Zhang, Ruohan and Guo"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/2010.15942v3")), fetcher.findFullText(entry)); + } + + @Test + void notFindFullTextByUnknownDOI() throws IOException { + entry.setField(StandardField.DOI, "10.1529/unknown"); + assertEquals(Optional.empty(), fetcher.findFullText(entry)); + } + + @Test + void notFindFullTextByUnknownId() throws IOException { + entry.setField(StandardField.EPRINT, "1234.12345"); + assertEquals(Optional.empty(), fetcher.findFullText(entry)); + } + + @Test + void findFullTextByDOINotAvailableInCatalog() throws IOException { + entry.setField(StandardField.DOI, "10.1016/0370-2693(77)90015-6"); + entry.setField(StandardField.TITLE, "Superspace formulation of supergravity"); + + assertEquals(Optional.empty(), fetcher.findFullText(entry)); + } + + @Test + void findFullTextEntityWithoutDoi() throws IOException { + assertEquals(Optional.empty(), fetcher.findFullText(entry)); + } + + @Test + void findFullTextTrustLevel() { + assertEquals(TrustLevel.PREPRINT, fetcher.getTrustLevel()); + } + + @Test + void searchEntryByPartOfTitle() throws Exception { + 
assertEquals(Collections.singletonList(mainResultPaper), + fetcher.performSearch("title:\"the architecture of mr. dLib's\"")); + } + + @Test + void searchEntryByPartOfTitleWithAcuteAccent() throws Exception { + assertEquals(Collections.singletonList(sliceTheoremPaper), + fetcher.performSearch("title:\"slice theorem for Fréchet\"")); + } + + @Test + void searchEntryByOldId() throws Exception { + BibEntry expected = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "{H1 Collaboration}") + .withField(StandardField.TITLE, "Multi-Electron Production at High Transverse Momenta in ep Collisions at HERA") + .withField(StandardField.NUMBER, "1") + .withField(StandardField.VOLUME, "31") + .withField(StandardField.PAGES, "17--29") + .withField(StandardField.DATE, "2003-07-07") + .withField(StandardField.YEAR, "2003") + .withField(StandardField.MONTH, "oct") + .withField(StandardField.ABSTRACT, "Multi-electron production is studied at high electron transverse momentum in positron- and electron-proton collisions using the H1 detector at HERA. The data correspond to an integrated luminosity of 115 pb-1. Di-electron and tri-electron event yields are measured. Cross sections are derived in a restricted phase space region dominated by photon-photon collisions. In general good agreement is found with the Standard Model predictions. However, for electron pair invariant masses above 100 GeV, three di-electron events and three tri-electron events are observed, compared to Standard Model expectations of 0.30 \\pm 0.04 and 0.23 \\pm 0.04, respectively.") + .withField(StandardField.PUBLISHER, "Springer Science and Business Media {LLC}") + .withField(StandardField.EPRINT, "hep-ex/0307015") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/hep-ex/0307015v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "hep-ex") + .withField(StandardField.KEYWORDS, "High Energy Physics - Experiment (hep-ex), FOS: Physical sciences") + .withField(StandardField.DOI, "10.1140/epjc/s2003-01326-x") + .withField(StandardField.JOURNAL, "Eur.Phys.J.C31:17-29,2003") + .withField(InternalField.KEY_FIELD, "2003") + .withField(new UnknownField("copyright"), "Assumed arXiv.org perpetual, non-exclusive license to distribute this article for submissions made before January 2004"); + + assertEquals(Optional.of(expected), fetcher.performSearchById("hep-ex/0307015")); + } + + @Test + void searchEntryByIdWith4DigitsAndVersion() throws Exception { + assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("1405.2249v1")); + } + + @Test + void searchEntryByIdWith4Digits() throws Exception { + assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("1405.2249")); + } + + @Test + void searchEntryByIdWith4DigitsAndPrefix() throws Exception { + assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("arXiv:1405.2249")); + } + + @Test + void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception { + assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("arXiv : 1405. 
2249")); + } + + @Test + void searchEntryByIdWith5Digits() throws Exception { + assertEquals(Optional.of( + "An Optimal Convergence Theorem for Mean Curvature Flow of Arbitrary Codimension in Hyperbolic Spaces"), + fetcher.performSearchById("1503.06747").flatMap(entry -> entry.getField(StandardField.TITLE))); + } + + @Test + void searchWithMalformedIdReturnsEmpty() throws Exception { + assertEquals(Optional.empty(), fetcher.performSearchById("123412345")); + } + + @Test + void searchIdentifierForSlicePaper() throws Exception { + sliceTheoremPaper.clearField(StandardField.EPRINT); + + assertEquals(ArXivIdentifier.parse("1405.2249"), fetcher.findIdentifier(sliceTheoremPaper)); + } + + @Test + void searchEmptyId() throws Exception { + assertEquals(Optional.empty(), fetcher.performSearchById("")); + } + + @Test + void searchWithHttpUrl() throws Exception { + assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("http://arxiv.org/abs/1405.2249")); + } + + @Test + void searchWithHttpsUrl() throws Exception { + assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("https://arxiv.org/abs/1405.2249")); + } + + @Test + void searchWithHttpsUrlNotTrimmed() throws Exception { + assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("https : // arxiv . org / abs / 1405 . 2249 ")); + } + + @Disabled("Is not supported by the current API") + @Test + @Override + public void supportsYearSearch() throws Exception { + } + + @Disabled("Is not supported by the current API") + @Test + @Override + public void supportsYearRangeSearch() throws Exception { + } + + /** + * A phrase is a sequence of terms wrapped in quotes. + * Only documents that contain exactly this sequence are returned. + */ + @Test + public void supportsPhraseSearch() throws Exception { + List resultWithPhraseSearch = fetcher.performSearch("title:\"Taxonomy of Distributed\""); + List resultWithOutPhraseSearch = fetcher.performSearch("title:Taxonomy AND title:of AND title:Distributed"); + // Phrase search result has to be subset of the default search result + assertTrue(resultWithOutPhraseSearch.containsAll(resultWithPhraseSearch)); + } + + /** + * A phrase is a sequence of terms wrapped in quotes. + * Only documents that contain exactly this sequence are returned. + */ + @Test + public void supportsPhraseSearchAndMatchesExact() throws Exception { + BibEntry expected = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Rafrastara, Fauzi Adi and Deyu, Qi") + .withField(StandardField.TITLE, "A Survey and Taxonomy of Distributed Data Mining Research Studies: A Systematic Literature Review") + .withField(StandardField.DATE, "2020-09-14") + .withField(StandardField.YEAR, "2020") + .withField(StandardField.PUBLISHER, "arXiv") + .withField(StandardField.ABSTRACT, "Context: Data Mining (DM) method has been evolving year by year and as of today there is also the enhancement of DM technique that can be run several times faster than the traditional one, called Distributed Data Mining (DDM). It is not a new field in data processing actually, but in the recent years many researchers have been paying more attention on this area. Problems: The number of publication regarding DDM in high reputation journals and conferences has increased significantly. It makes difficult for researchers to gain a comprehensive view of DDM that require further research. Solution: We conducted a systematic literature review to map the previous research in DDM field. 
Our objective is to provide the motivation for new research by identifying the gap in DDM field as well as the hot area itself. Result: Our analysis came up with some conclusions by answering 7 research questions proposed in this literature review. In addition, the taxonomy of DDM research area is presented in this paper. Finally, this systematic literature review provides the statistic of development of DDM since 2000 to 2015, in which this will help the future researchers to have a comprehensive overview of current situation of DDM.") + .withField(StandardField.EPRINT, "2009.10618") + .withField(StandardField.DOI, "10.48550/ARXIV.2009.10618") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/2009.10618v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "cs.DC") + .withField(StandardField.KEYWORDS, "Distributed / Parallel / Cluster Computing (cs.DC), Machine Learning (cs.LG), FOS: Computer and information sciences") + .withField(InternalField.KEY_FIELD, "https://doi.org/10.48550/arxiv.2009.10618") + .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license"); + + List resultWithPhraseSearch = fetcher.performSearch("title:\"Taxonomy of Distributed\""); + + // There is only a single paper found by searching that contains the exact sequence "Taxonomy of Distributed" in the title. + assertEquals(Collections.singletonList(expected), resultWithPhraseSearch); + } + + @Test + public void supportsBooleanANDSearch() throws Exception { + BibEntry expected = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Büscher, Tobias and Diez, Angel L. and Gompper, Gerhard and Elgeti, Jens") + .withField(StandardField.TITLE, "Instability and fingering of interfaces in growing tissue") + .withField(StandardField.DATE, "2020-03-10") + .withField(StandardField.YEAR, "2020") + .withField(StandardField.MONTH, "aug") + .withField(StandardField.NUMBER, "8") + .withField(StandardField.VOLUME, "22") + .withField(StandardField.PAGES, "083005") + .withField(StandardField.PUBLISHER, "{IOP} Publishing") + .withField(StandardField.JOURNAL, "New Journal of Physics") + .withField(StandardField.ABSTRACT, "Interfaces in tissues are ubiquitous, both between tissue and environment as well as between populations of different cell types. The propagation of an interface can be driven mechanically. % e.g. by a difference in the respective homeostatic stress of the different cell types. Computer simulations of growing tissues are employed to study the stability of the interface between two tissues on a substrate. From a mechanical perspective, the dynamics and stability of this system is controlled mainly by four parameters of the respective tissues: (i) the homeostatic stress (ii) cell motility (iii) tissue viscosity and (iv) substrate friction. For propagation driven by a difference in homeostatic stress, the interface is stable for tissue-specific substrate friction even for very large differences of homeostatic stress; however, it becomes unstable above a critical stress difference when the tissue with the larger homeostatic stress has a higher viscosity. A small difference in directed bulk motility between the two tissues suffices to result in propagation with a stable interface, even for otherwise identical tissues. Larger differences in motility force, however, result in a finite-wavelength instability of the interface. 
Interestingly, the instability is apparently bound by nonlinear effects and the amplitude of the interface undulations only grows to a finite value in time.") + .withField(StandardField.DOI, "10.1088/1367-2630/ab9e88") + .withField(StandardField.EPRINT, "2003.04601") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/2003.04601v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "q-bio.TO") + .withField(StandardField.KEYWORDS, "Tissues and Organs (q-bio.TO), FOS: Biological sciences") + .withField(InternalField.KEY_FIELD, "B_scher_2020") + .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license"); + + List result = fetcher.performSearch("author:\"Tobias Büscher\" AND title:\"Instability and fingering of interfaces\""); + + // There is only one paper authored by Tobias Büscher with that phrase in the title + assertEquals(Collections.singletonList(expected), result); + } + + @Test + public void retrievePureArxivEntryWhenAllDOIFetchingFails() throws FetcherException { + BibEntry expected = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Hai Zheng and Po-Yi Ho and Meiling Jiang and Bin Tang and Weirong Liu and Dengjin Li and Xuefeng Yu and Nancy E. Kleckner and Ariel Amir and Chenli Liu") + .withField(StandardField.TITLE, "Interrogating the Escherichia coli cell cycle by cell dimension perturbations") + .withField(StandardField.DATE, "2017-01-03") + .withField(StandardField.JOURNAL, "PNAS December 27, 2016 vol. 113 no. 52 15000-15005") + .withField(StandardField.ABSTRACT, "Bacteria tightly regulate and coordinate the various events in their cell cycles to duplicate themselves accurately and to control their cell sizes. Growth of Escherichia coli, in particular, follows a relation known as Schaechter 's growth law. This law says that the average cell volume scales exponentially with growth rate, with a scaling exponent equal to the time from initiation of a round of DNA replication to the cell division at which the corresponding sister chromosomes segregate. Here, we sought to test the robustness of the growth law to systematic perturbations in cell dimensions achieved by varying the expression levels of mreB and ftsZ. We found that decreasing the mreB level resulted in increased cell width, with little change in cell length, whereas decreasing the ftsZ level resulted in increased cell length. Furthermore, the time from replication termination to cell division increased with the perturbed dimension in both cases. Moreover, the growth law remained valid over a range of growth conditions and dimension perturbations. The growth law can be quantitatively interpreted as a consequence of a tight coupling of cell division to replication initiation. Thus, its robustness to perturbations in cell dimensions strongly supports models in which the timing of replication initiation governs that of cell division, and cell volume is the key phenomenological variable governing the timing of replication initiation. 
These conclusions are discussed in the context of our recently proposed adder-per-origin model, in which cells add a constant volume per origin between initiations and divide a constant time after initiation.") + .withField(StandardField.DOI, "10.1073/pnas.1617932114") + .withField(StandardField.EPRINT, "1701.00587") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1701.00587v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "q-bio.CB") + .withField(StandardField.KEYWORDS, "q-bio.CB"); + + DoiFetcher modifiedDoiFetcher = Mockito.spy(new DoiFetcher(importFormatPreferences)); + when(modifiedDoiFetcher.performSearchById("10.1073/pnas.1617932114")).thenThrow(new FetcherException("Could not fetch user-assigned DOI")); + when(modifiedDoiFetcher.performSearchById("10.48550/arXiv.1701.00587")).thenThrow(new FetcherException("Could not fetch ArXiv-assigned DOI")); + + ArXivFetcher modifiedArXivFetcher = Mockito.spy(new ArXivFetcher(importFormatPreferences, modifiedDoiFetcher)); + assertEquals(Optional.of(expected), modifiedArXivFetcher.performSearchById("1701.00587")); + } + + @Test + public void canReplicateArXivOnlySearchByPassingNullParameter() throws FetcherException { + BibEntry expected = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Hai Zheng and Po-Yi Ho and Meiling Jiang and Bin Tang and Weirong Liu and Dengjin Li and Xuefeng Yu and Nancy E. Kleckner and Ariel Amir and Chenli Liu") + .withField(StandardField.TITLE, "Interrogating the Escherichia coli cell cycle by cell dimension perturbations") + .withField(StandardField.DATE, "2017-01-03") + .withField(StandardField.JOURNAL, "PNAS December 27, 2016 vol. 113 no. 52 15000-15005") + .withField(StandardField.ABSTRACT, "Bacteria tightly regulate and coordinate the various events in their cell cycles to duplicate themselves accurately and to control their cell sizes. Growth of Escherichia coli, in particular, follows a relation known as Schaechter 's growth law. This law says that the average cell volume scales exponentially with growth rate, with a scaling exponent equal to the time from initiation of a round of DNA replication to the cell division at which the corresponding sister chromosomes segregate. Here, we sought to test the robustness of the growth law to systematic perturbations in cell dimensions achieved by varying the expression levels of mreB and ftsZ. We found that decreasing the mreB level resulted in increased cell width, with little change in cell length, whereas decreasing the ftsZ level resulted in increased cell length. Furthermore, the time from replication termination to cell division increased with the perturbed dimension in both cases. Moreover, the growth law remained valid over a range of growth conditions and dimension perturbations. The growth law can be quantitatively interpreted as a consequence of a tight coupling of cell division to replication initiation. Thus, its robustness to perturbations in cell dimensions strongly supports models in which the timing of replication initiation governs that of cell division, and cell volume is the key phenomenological variable governing the timing of replication initiation. 
These conclusions are discussed in the context of our recently proposed adder-per-origin model, in which cells add a constant volume per origin between initiations and divide a constant time after initiation.") + .withField(StandardField.DOI, "10.1073/pnas.1617932114") + .withField(StandardField.EPRINT, "1701.00587") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1701.00587v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "q-bio.CB") + .withField(StandardField.KEYWORDS, "q-bio.CB"); + + ArXivFetcher modifiedArXivFetcher = new ArXivFetcher(importFormatPreferences, null); + assertEquals(Optional.of(expected), modifiedArXivFetcher.performSearchById("1701.00587")); + } + + @Test + public void retrievePartialResultWhenCannotGetInformationFromUserAssignedDOI() throws FetcherException { + BibEntry expected = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Zheng, Hai and Ho, Po-Yi and Jiang, Meiling and Tang, Bin and Liu, Weirong and Li, Dengjin and Yu, Xuefeng and Kleckner, Nancy E. and Amir, Ariel and Liu, Chenli") + .withField(StandardField.TITLE, "Interrogating the Escherichia coli cell cycle by cell dimension perturbations") + .withField(StandardField.DATE, "2017-01-03") + .withField(StandardField.JOURNAL, "PNAS December 27, 2016 vol. 113 no. 52 15000-15005") + .withField(StandardField.ABSTRACT, "Bacteria tightly regulate and coordinate the various events in their cell cycles to duplicate themselves accurately and to control their cell sizes. Growth of Escherichia coli, in particular, follows a relation known as Schaechter 's growth law. This law says that the average cell volume scales exponentially with growth rate, with a scaling exponent equal to the time from initiation of a round of DNA replication to the cell division at which the corresponding sister chromosomes segregate. Here, we sought to test the robustness of the growth law to systematic perturbations in cell dimensions achieved by varying the expression levels of mreB and ftsZ. We found that decreasing the mreB level resulted in increased cell width, with little change in cell length, whereas decreasing the ftsZ level resulted in increased cell length. Furthermore, the time from replication termination to cell division increased with the perturbed dimension in both cases. Moreover, the growth law remained valid over a range of growth conditions and dimension perturbations. The growth law can be quantitatively interpreted as a consequence of a tight coupling of cell division to replication initiation. Thus, its robustness to perturbations in cell dimensions strongly supports models in which the timing of replication initiation governs that of cell division, and cell volume is the key phenomenological variable governing the timing of replication initiation. 
These conclusions are discussed in the context of our recently proposed adder-per-origin model, in which cells add a constant volume per origin between initiations and divide a constant time after initiation.") + .withField(StandardField.DOI, "10.1073/pnas.1617932114") + .withField(StandardField.EPRINT, "1701.00587") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1701.00587v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "q-bio.CB") + .withField(StandardField.KEYWORDS, "Cell Behavior (q-bio.CB), FOS: Biological sciences") + .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license") + .withField(InternalField.KEY_FIELD, "https://doi.org/10.48550/arxiv.1701.00587") + .withField(StandardField.YEAR, "2017") + .withField(StandardField.PUBLISHER, "arXiv"); + + DoiFetcher modifiedDoiFetcher = Mockito.spy(new DoiFetcher(importFormatPreferences)); + when(modifiedDoiFetcher.performSearchById("10.1073/pnas.1617932114")).thenThrow(new FetcherException("Could not fetch user-assigned DOI")); + + ArXivFetcher modifiedArXivFetcher = Mockito.spy(new ArXivFetcher(importFormatPreferences, modifiedDoiFetcher)); + assertEquals(Optional.of(expected), modifiedArXivFetcher.performSearchById("1701.00587")); + } + + @Test + public void retrievePartialResultWhenCannotGetInformationFromArXivAssignedDOI() throws FetcherException { + BibEntry expected = new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Hai Zheng and Po-Yi Ho and Meiling Jiang and Bin Tang and Weirong Liu and Dengjin Li and Xuefeng Yu and Nancy E. Kleckner and Ariel Amir and Chenli Liu") + .withField(StandardField.TITLE, "Interrogating the Escherichia coli cell cycle by cell dimension perturbations") + .withField(StandardField.DATE, "2017-01-03") + .withField(StandardField.JOURNAL, "PNAS December 27, 2016 vol. 113 no. 52 15000-15005") + .withField(StandardField.ABSTRACT, "Bacteria tightly regulate and coordinate the various events in their cell cycles to duplicate themselves accurately and to control their cell sizes. Growth of Escherichia coli, in particular, follows a relation known as Schaechter 's growth law. This law says that the average cell volume scales exponentially with growth rate, with a scaling exponent equal to the time from initiation of a round of DNA replication to the cell division at which the corresponding sister chromosomes segregate. Here, we sought to test the robustness of the growth law to systematic perturbations in cell dimensions achieved by varying the expression levels of mreB and ftsZ. We found that decreasing the mreB level resulted in increased cell width, with little change in cell length, whereas decreasing the ftsZ level resulted in increased cell length. Furthermore, the time from replication termination to cell division increased with the perturbed dimension in both cases. Moreover, the growth law remained valid over a range of growth conditions and dimension perturbations. The growth law can be quantitatively interpreted as a consequence of a tight coupling of cell division to replication initiation. Thus, its robustness to perturbations in cell dimensions strongly supports models in which the timing of replication initiation governs that of cell division, and cell volume is the key phenomenological variable governing the timing of replication initiation. 
These conclusions are discussed in the context of our recently proposed adder-per-origin model, in which cells add a constant volume per origin between initiations and divide a constant time after initiation.") + .withField(StandardField.DOI, "10.1073/pnas.1617932114") + .withField(StandardField.EPRINT, "1701.00587") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1701.00587v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "q-bio.CB") + .withField(StandardField.KEYWORDS, "q-bio.CB") + .withField(StandardField.MONTH, "dec") + .withField(StandardField.YEAR, "2016") + .withField(StandardField.VOLUME, "113") + .withField(InternalField.KEY_FIELD, "Zheng_2016") + .withField(StandardField.PUBLISHER, "Proceedings of the National Academy of Sciences") + .withField(StandardField.PAGES, "15000--15005") + .withField(StandardField.NUMBER, "52"); + + DoiFetcher modifiedDoiFetcher = Mockito.spy(new DoiFetcher(importFormatPreferences)); + when(modifiedDoiFetcher.performSearchById("10.48550/arXiv.1701.00587")).thenThrow(new FetcherException("Could not fetch ArXiv-assigned DOI")); + + ArXivFetcher modifiedArXivFetcher = Mockito.spy(new ArXivFetcher(importFormatPreferences, modifiedDoiFetcher)); + assertEquals(Optional.of(expected), modifiedArXivFetcher.performSearchById("1701.00587")); + } +} diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java deleted file mode 100644 index 0a65fd4bf9e..00000000000 --- a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java +++ /dev/null @@ -1,342 +0,0 @@ -package org.jabref.logic.importer.fetcher; - -import java.io.IOException; -import java.net.URL; -import java.util.Collections; -import java.util.List; -import java.util.Optional; - -import org.jabref.logic.importer.ImportFormatPreferences; -import org.jabref.logic.importer.PagedSearchBasedFetcher; -import org.jabref.logic.importer.SearchBasedFetcher; -import org.jabref.model.entry.BibEntry; -import org.jabref.model.entry.field.StandardField; -import org.jabref.model.entry.identifier.ArXivIdentifier; -import org.jabref.model.entry.types.StandardEntryType; -import org.jabref.testutils.category.FetcherTest; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -@FetcherTest -class ArXivTest implements SearchBasedFetcherCapabilityTest, PagedSearchFetcherTest { - private ArXiv fetcher; - private BibEntry entry; - private BibEntry sliceTheoremPaper; - - @BeforeEach - void setUp() { - ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class); - when(importFormatPreferences.getKeywordSeparator()).thenReturn(','); - fetcher = new ArXiv(importFormatPreferences); - entry = new BibEntry(); - sliceTheoremPaper = new BibEntry(StandardEntryType.Article) - .withField(StandardField.AUTHOR, "Tobias Diez") - .withField(StandardField.TITLE, "Slice theorem for Fréchet group actions and covariant symplectic field theory") - .withField(StandardField.DATE, "2014-05-09") - .withField(StandardField.ABSTRACT, "A general slice theorem for the action of a Fr\\'echet Lie group on a Fr\\'echet manifolds is established. 
The Nash-Moser theorem provides the fundamental tool to generalize the result of Palais to this infinite-dimensional setting. The presented slice theorem is illustrated by its application to gauge theories: the action of the gauge transformation group admits smooth slices at every point and thus the gauge orbit space is stratified by Fr\\'echet manifolds. Furthermore, a covariant and symplectic formulation of classical field theory is proposed and extensively discussed. At the root of this novel framework is the incorporation of field degrees of freedom F and spacetime M into the product manifold F * M. The induced bigrading of differential forms is used in order to carry over the usual symplectic theory to this new setting. The examples of the Klein-Gordon field and general Yang-Mills theory illustrate that the presented approach conveniently handles the occurring symmetries.") - .withField(StandardField.EPRINT, "1405.2249") - .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1405.2249v1:PDF") - .withField(StandardField.EPRINTTYPE, "arXiv") - .withField(StandardField.EPRINTCLASS, "math-ph") - .withField(StandardField.KEYWORDS, "math-ph, math.DG, math.MP, math.SG, 58B99, 58Z05, 58B25, 22E65, 58D19, 53D20, 53D42"); - } - - @Test - void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException { - assertEquals(Optional.empty(), fetcher.findFullText(entry)); - } - - @Test - void findFullTextRejectsNullParameter() { - assertThrows(NullPointerException.class, () -> fetcher.findFullText(null)); - } - - @Test - void findFullTextByDOI() throws IOException { - entry.setField(StandardField.DOI, "10.1529/biophysj.104.047340"); - entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); - - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByEprint() throws IOException { - entry.setField(StandardField.EPRINT, "1603.06570"); - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByEprintWithPrefix() throws IOException { - entry.setField(StandardField.EPRINT, "arXiv:1603.06570"); - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByEprintWithUnknownDOI() throws IOException { - entry.setField(StandardField.DOI, "10.1529/unknown"); - entry.setField(StandardField.EPRINT, "1603.06570"); - - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByTitle() throws IOException { - entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); - - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByTitleWithCurlyBracket() throws IOException { - entry.setField(StandardField.TITLE, "Machine versus {Human} {Attention} in {Deep} {Reinforcement} {Learning} {Tasks}"); - - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/2010.15942v3")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByTitleWithColonAndJournalWithoutEprint() throws IOException { - entry.setField(StandardField.TITLE, "Bayes-TrEx: a Bayesian Sampling Approach to Model Transparency by Example"); - entry.setField(StandardField.JOURNAL, "arXiv:2002.10248v4 [cs]"); - - assertEquals(Optional.of(new 
URL("http://arxiv.org/pdf/2002.10248v4")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByTitleWithColonAndUrlWithoutEprint() throws IOException { - entry.setField(StandardField.TITLE, "Bayes-TrEx: a Bayesian Sampling Approach to Model Transparency by Example"); - entry.setField(StandardField.URL, "http://arxiv.org/abs/2002.10248v4"); - - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/2002.10248v4")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByTitleAndPartOfAuthor() throws IOException { - entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); - entry.setField(StandardField.AUTHOR, "Weeks and Lucks"); - - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByTitleWithCurlyBracketAndPartOfAuthor() throws IOException { - entry.setField(StandardField.TITLE, "Machine versus {Human} {Attention} in {Deep} {Reinforcement} {Learning} {Tasks}"); - entry.setField(StandardField.AUTHOR, "Zhang, Ruohan and Guo"); - - assertEquals(Optional.of(new URL("http://arxiv.org/pdf/2010.15942v3")), fetcher.findFullText(entry)); - } - - @Test - void notFindFullTextByUnknownDOI() throws IOException { - entry.setField(StandardField.DOI, "10.1529/unknown"); - assertEquals(Optional.empty(), fetcher.findFullText(entry)); - } - - @Test - void notFindFullTextByUnknownId() throws IOException { - entry.setField(StandardField.EPRINT, "1234.12345"); - assertEquals(Optional.empty(), fetcher.findFullText(entry)); - } - - @Test - void findFullTextByDOINotAvailableInCatalog() throws IOException { - entry.setField(StandardField.DOI, "10.1016/0370-2693(77)90015-6"); - entry.setField(StandardField.TITLE, "Superspace formulation of supergravity"); - - assertEquals(Optional.empty(), fetcher.findFullText(entry)); - } - - @Test - void findFullTextEntityWithoutDoi() throws IOException { - assertEquals(Optional.empty(), fetcher.findFullText(entry)); - } - - @Test - void findFullTextTrustLevel() { - assertEquals(TrustLevel.PREPRINT, fetcher.getTrustLevel()); - } - - @Test - void searchEntryByPartOfTitle() throws Exception { - assertEquals(Collections.singletonList(sliceTheoremPaper), - fetcher.performSearch("title:\"slice theorem for Frechet\"")); - } - - @Test - void searchEntryByPartOfTitleWithAcuteAccent() throws Exception { - assertEquals(Collections.singletonList(sliceTheoremPaper), - fetcher.performSearch("title:\"slice theorem for Fréchet\"")); - } - - @Test - void searchEntryByOldId() throws Exception { - BibEntry expected = new BibEntry(StandardEntryType.Article) - .withField(StandardField.AUTHOR, "H1 Collaboration") - .withField(StandardField.TITLE, "Multi-Electron Production at High Transverse Momenta in ep Collisions at HERA") - .withField(StandardField.DATE, "2003-07-07") - .withField(StandardField.ABSTRACT, "Multi-electron production is studied at high electron transverse momentum in positron- and electron-proton collisions using the H1 detector at HERA. The data correspond to an integrated luminosity of 115 pb-1. Di-electron and tri-electron event yields are measured. Cross sections are derived in a restricted phase space region dominated by photon-photon collisions. In general good agreement is found with the Standard Model predictions. 
However, for electron pair invariant masses above 100 GeV, three di-electron events and three tri-electron events are observed, compared to Standard Model expectations of 0.30 \\pm 0.04 and 0.23 \\pm 0.04, respectively.") - .withField(StandardField.EPRINT, "hep-ex/0307015") - .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/hep-ex/0307015v1:PDF") - .withField(StandardField.EPRINTTYPE, "arXiv") - .withField(StandardField.EPRINTCLASS, "hep-ex") - .withField(StandardField.KEYWORDS, "hep-ex") - .withField(StandardField.DOI, "10.1140/epjc/s2003-01326-x") - .withField(StandardField.JOURNALTITLE, "Eur.Phys.J.C31:17-29,2003"); - - assertEquals(Optional.of(expected), fetcher.performSearchById("hep-ex/0307015")); - } - - @Test - void searchEntryByIdWith4DigitsAndVersion() throws Exception { - assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("1405.2249v1")); - } - - @Test - void searchEntryByIdWith4Digits() throws Exception { - assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("1405.2249")); - } - - @Test - void searchEntryByIdWith4DigitsAndPrefix() throws Exception { - assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("arXiv:1405.2249")); - } - - @Test - void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception { - assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("arXiv : 1405. 2249")); - } - - @Test - void searchEntryByIdWith5Digits() throws Exception { - assertEquals(Optional.of( - "An Optimal Convergence Theorem for Mean Curvature Flow of Arbitrary Codimension in Hyperbolic Spaces"), - fetcher.performSearchById("1503.06747").flatMap(entry -> entry.getField(StandardField.TITLE))); - } - - @Test - void searchWithMalformedIdThrowsException() throws Exception { - assertEquals(Optional.empty(), fetcher.performSearchById("123412345")); - } - - @Test - void searchIdentifierForSlicePaper() throws Exception { - sliceTheoremPaper.clearField(StandardField.EPRINT); - - assertEquals(ArXivIdentifier.parse("1405.2249"), fetcher.findIdentifier(sliceTheoremPaper)); - } - - @Test - void searchEmptyId() throws Exception { - assertEquals(Optional.empty(), fetcher.performSearchById("")); - } - - @Test - void searchWithHttpUrl() throws Exception { - assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("http://arxiv.org/abs/1405.2249")); - } - - @Test - void searchWithHttpsUrl() throws Exception { - assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("https://arxiv.org/abs/1405.2249")); - } - - @Test - void searchWithHttpsUrlNotTrimmed() throws Exception { - assertEquals(Optional.of(sliceTheoremPaper), fetcher.performSearchById("https : // arxiv . org / abs / 1405 . 2249 ")); - } - - @Override - public SearchBasedFetcher getFetcher() { - return fetcher; - } - - @Override - public List getTestAuthors() { - return List.of("Tobias Diez"); - } - - @Disabled("Is not supported by the current API") - @Test - @Override - public void supportsYearSearch() throws Exception { - } - - @Disabled("Is not supported by the current API") - @Test - @Override - public void supportsYearRangeSearch() throws Exception { - } - - @Override - public String getTestJournal() { - return "Journal of Geometry and Physics (2013)"; - } - - /** - * A phrase is a sequence of terms wrapped in quotes. - * Only documents that contain exactly this sequence are returned. 
- */ - @Test - public void supportsPhraseSearch() throws Exception { - List resultWithPhraseSearch = fetcher.performSearch("title:\"Taxonomy of Distributed\""); - List resultWithOutPhraseSearch = fetcher.performSearch("title:Taxonomy AND title:of AND title:Distributed"); - // Phrase search result has to be subset of the default search result - assertTrue(resultWithOutPhraseSearch.containsAll(resultWithPhraseSearch)); - } - - /** - * A phrase is a sequence of terms wrapped in quotes. - * Only documents that contain exactly this sequence are returned. - */ - @Test - public void supportsPhraseSearchAndMatchesExact() throws Exception { - BibEntry expected = new BibEntry(StandardEntryType.Article) - .withField(StandardField.AUTHOR, "Fauzi Adi Rafrastara and Qi Deyu") - .withField(StandardField.TITLE, "A Survey and Taxonomy of Distributed Data Mining Research Studies: A Systematic Literature Review") - .withField(StandardField.DATE, "2020-09-14") - .withField(StandardField.ABSTRACT, "Context: Data Mining (DM) method has been evolving year by year and as of today there is also the enhancement of DM technique that can be run several times faster than the traditional one, called Distributed Data Mining (DDM). It is not a new field in data processing actually, but in the recent years many researchers have been paying more attention on this area. Problems: The number of publication regarding DDM in high reputation journals and conferences has increased significantly. It makes difficult for researchers to gain a comprehensive view of DDM that require further research. Solution: We conducted a systematic literature review to map the previous research in DDM field. Our objective is to provide the motivation for new research by identifying the gap in DDM field as well as the hot area itself. Result: Our analysis came up with some conclusions by answering 7 research questions proposed in this literature review. In addition, the taxonomy of DDM research area is presented in this paper. Finally, this systematic literature review provides the statistic of development of DDM since 2000 to 2015, in which this will help the future researchers to have a comprehensive overview of current situation of DDM.") - .withField(StandardField.EPRINT, "2009.10618") - .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/2009.10618v1:PDF") - .withField(StandardField.EPRINTTYPE, "arXiv") - .withField(StandardField.EPRINTCLASS, "cs.DC") - .withField(StandardField.KEYWORDS, "cs.DC, cs.LG"); - - List resultWithPhraseSearch = fetcher.performSearch("title:\"Taxonomy of Distributed\""); - - // There is only a single paper found by searching that contains the exact sequence "Taxonomy of Distributed" in the title. - assertEquals(Collections.singletonList(expected), resultWithPhraseSearch); - } - - @Override - public PagedSearchBasedFetcher getPagedFetcher() { - return fetcher; - } - - @Test - public void supportsBooleanANDSearch() throws Exception { - BibEntry expected = new BibEntry(StandardEntryType.Article) - .withField(StandardField.AUTHOR, "Tobias Büscher and Angel L. Diez and Gerhard Gompper and Jens Elgeti") - .withField(StandardField.TITLE, "Instability and fingering of interfaces in growing tissue") - .withField(StandardField.DATE, "2020-03-10") - .withField(StandardField.ABSTRACT, "Interfaces in tissues are ubiquitous, both between tissue and environment as well as between populations of different cell types. The propagation of an interface can be driven mechanically. % e.g. 
by a difference in the respective homeostatic stress of the different cell types. Computer simulations of growing tissues are employed to study the stability of the interface between two tissues on a substrate. From a mechanical perspective, the dynamics and stability of this system is controlled mainly by four parameters of the respective tissues: (i) the homeostatic stress (ii) cell motility (iii) tissue viscosity and (iv) substrate friction. For propagation driven by a difference in homeostatic stress, the interface is stable for tissue-specific substrate friction even for very large differences of homeostatic stress; however, it becomes unstable above a critical stress difference when the tissue with the larger homeostatic stress has a higher viscosity. A small difference in directed bulk motility between the two tissues suffices to result in propagation with a stable interface, even for otherwise identical tissues. Larger differences in motility force, however, result in a finite-wavelength instability of the interface. Interestingly, the instability is apparently bound by nonlinear effects and the amplitude of the interface undulations only grows to a finite value in time.") - .withField(StandardField.DOI, "10.1088/1367-2630/ab9e88") - .withField(StandardField.EPRINT, "2003.04601") - .withField(StandardField.DOI, "10.1088/1367-2630/ab9e88") - .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/2003.04601v1:PDF") - .withField(StandardField.EPRINTTYPE, "arXiv") - .withField(StandardField.EPRINTCLASS, "q-bio.TO") - .withField(StandardField.KEYWORDS, "q-bio.TO"); - - List result = fetcher.performSearch("author:\"Tobias Büscher\" AND title:\"Instability and fingering of interfaces\""); - - // There is only one paper authored by Tobias Büscher with that phrase in the title - assertEquals(Collections.singletonList(expected), result); - } -} diff --git a/src/test/java/org/jabref/logic/importer/fetcher/CompositeIdFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/CompositeIdFetcherTest.java index 1883b908fa8..ce2427a21e4 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/CompositeIdFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/CompositeIdFetcherTest.java @@ -8,7 +8,9 @@ import org.jabref.logic.importer.FetcherException; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.InternalField; import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.field.UnknownField; import org.jabref.model.entry.types.StandardEntryType; import org.jabref.testutils.category.FetcherTest; @@ -36,16 +38,26 @@ public static Stream performSearchByIdReturnsCorrectEntryForIdentifie Arguments.arguments( "performSearchByIdReturnsCorrectEntryForArXivId", new BibEntry(StandardEntryType.Article) - .withField(StandardField.AUTHOR, "Emily C. Cunningham and Robyn E. Sanderson and Kathryn V. Johnston and Nondh Panithanpaisal and Melissa K. Ness and Andrew Wetzel and Sarah R. Loebman and Ivanna Escala and Danny Horta and Claude-André Faucher-Giguère") + .withField(StandardField.AUTHOR, "Cunningham, Emily C. and Sanderson, Robyn E. and Johnston, Kathryn V. and Panithanpaisal, Nondh and Ness, Melissa K. and Wetzel, Andrew and Loebman, Sarah R. 
and Escala, Ivanna and Horta, Danny and Faucher-Giguère, Claude-André") .withField(StandardField.TITLE, "Reading the CARDs: the Imprint of Accretion History in the Chemical Abundances of the Milky Way's Stellar Halo") .withField(StandardField.DATE, "2021-10-06") + .withField(StandardField.YEAR, "2021") + .withField(StandardField.MONTH, "aug") + .withField(StandardField.NUMBER, "2") + .withField(StandardField.VOLUME, "934") + .withField(StandardField.PUBLISHER, "American Astronomical Society") + .withField(StandardField.JOURNAL, "The Astrophysical Journal") + .withField(StandardField.PAGES, "172") .withField(StandardField.ABSTRACT, "In the era of large-scale spectroscopic surveys in the Local Group (LG), we can explore using chemical abundances of halo stars to study the star formation and chemical enrichment histories of the dwarf galaxy progenitors of the Milky Way (MW) and M31 stellar halos. In this paper, we investigate using the Chemical Abundance Ratio Distributions (CARDs) of seven stellar halos from the Latte suite of FIRE-2 simulations. We attempt to infer galaxies' assembly histories by modelling the CARDs of the stellar halos of the Latte galaxies as a linear combination of template CARDs from disrupted dwarfs, with different stellar masses $M_{\\star}$ and quenching times $t_{100}$. We present a method for constructing these templates using present-day dwarf galaxies. For four of the seven Latte halos studied in this work, we recover the mass spectrum of accreted dwarfs to a precision of $<10\\%$. For the fraction of mass accreted as a function of $t_{100}$, we find residuals of $20-30\\%$ for five of the seven simulations. We discuss the failure modes of this method, which arise from the diversity of star formation and chemical enrichment histories dwarf galaxies can take. These failure cases can be robustly identified by the high model residuals. Though the CARDs modeling method does not successfully infer the assembly histories in these cases, the CARDs of these disrupted dwarfs contain signatures of their unusual formation histories. 
Our results are promising for using CARDs to learn more about the histories of the progenitors of the MW and M31 stellar halos.") + .withField(StandardField.DOI, "10.3847/1538-4357/ac78ea") .withField(StandardField.EPRINT, "2110.02957") .withField(StandardField.DOI, "10.3847/1538-4357/ac78ea") .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/2110.02957v1:PDF") .withField(StandardField.EPRINTTYPE, "arXiv") .withField(StandardField.EPRINTCLASS, "astro-ph.GA") - .withField(StandardField.KEYWORDS, "astro-ph.GA"), + .withField(StandardField.KEYWORDS, "Astrophysics of Galaxies (astro-ph.GA), FOS: Physical sciences") + .withField(InternalField.KEY_FIELD, "Cunningham_2022") + .withField(new UnknownField("copyright"), "Creative Commons Attribution 4.0 International"), "arXiv:2110.02957" ), /* disabled, because Iacr does not work @@ -95,6 +107,8 @@ void setUp() throws Exception { ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class); FieldContentFormatterPreferences fieldContentFormatterPreferences = mock(FieldContentFormatterPreferences.class); when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn(fieldContentFormatterPreferences); + // Needed for ArXiv Fetcher keyword processing + when(importFormatPreferences.getKeywordSeparator()).thenReturn(','); compositeIdFetcher = new CompositeIdFetcher(importFormatPreferences); } diff --git a/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java index fe0db637765..6ca761f0283 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java @@ -100,7 +100,7 @@ static Stream performSearchParameters() { List> fetcherParameters = new ArrayList<>(); List list = List.of( - new ArXiv(importFormatPreferences), + new ArXivFetcher(importFormatPreferences), new INSPIREFetcher(importFormatPreferences), new GvkFetcher(), new AstrophysicsDataSystem(importFormatPreferences, importerPreferences), diff --git a/src/test/java/org/jabref/logic/importer/fetcher/SearchBasedFetcherCapabilityTest.java b/src/test/java/org/jabref/logic/importer/fetcher/SearchBasedFetcherCapabilityTest.java index 53c0838db04..3de0af37fd4 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/SearchBasedFetcherCapabilityTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/SearchBasedFetcherCapabilityTest.java @@ -85,6 +85,9 @@ default void supportsYearRangeSearch() throws Exception { /** * Test whether the library API supports journal based search. 
+ * + * WARNING: the error while merging information from user-assigned DOI (more specifically, "10.1016/j.geomphys.2012.09.009") + * is related to a failed read by the Bibtex Parser (title is formatted in a weird way) */ @Test default void supportsJournalSearch() throws Exception { diff --git a/src/test/java/org/jabref/model/entry/BibEntryTest.java b/src/test/java/org/jabref/model/entry/BibEntryTest.java index 84ad88fff79..a27c2ccdc61 100644 --- a/src/test/java/org/jabref/model/entry/BibEntryTest.java +++ b/src/test/java/org/jabref/model/entry/BibEntryTest.java @@ -6,11 +6,14 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Optional; +import java.util.Set; import org.jabref.model.FieldChange; import org.jabref.model.database.BibDatabase; import org.jabref.model.entry.field.BibField; +import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.FieldPriority; import org.jabref.model.entry.field.InternalField; import org.jabref.model.entry.field.OrFields; @@ -657,4 +660,153 @@ void builderReturnsABibEntryNotChangedFlagged() { entry = new BibEntry().withField(StandardField.AUTHOR, "value"); assertFalse(entry.hasChanged()); } + + @Test + void mergeEntriesWithNoOverlap() { + BibEntry expected = new BibEntry() + .withField(StandardField.AUTHOR, "Test Author") + .withField(StandardField.TITLE, "Test Title") + .withField(StandardField.EPRINT, "1234.56789") + .withField(StandardField.DATE, "1970-01-01"); + + BibEntry copyEntry = (BibEntry) entry.clone(); + BibEntry otherEntry = new BibEntry(); + + copyEntry.setField(Map.of( + StandardField.AUTHOR, "Test Author", + StandardField.TITLE, "Test Title")); + + otherEntry.setField(Map.of( + StandardField.EPRINT, "1234.56789", + StandardField.DATE, "1970-01-01" + )); + + copyEntry.mergeWith(otherEntry); + assertEquals(expected.getFields(), copyEntry.getFields()); + } + + @Test + void mergeEntriesWithOverlap() { + BibEntry expected = new BibEntry() + .withField(StandardField.AUTHOR, "Test Author") + .withField(StandardField.TITLE, "Test Title") + .withField(StandardField.DATE, "1970-01-01"); + + BibEntry copyEntry = (BibEntry) entry.clone(); + BibEntry otherEntry = new BibEntry(); + + copyEntry.setField(Map.of( + StandardField.AUTHOR, "Test Author", + StandardField.TITLE, "Test Title")); + + otherEntry.setField(Map.of( + StandardField.AUTHOR, "Another Test Author", + StandardField.DATE, "1970-01-01" + )); + + copyEntry.mergeWith(otherEntry); + assertEquals(expected.getFields(), copyEntry.getFields()); + } + + @Test + void mergeEntriesWithNoOverlapAndNonExistingPriorityFields() { + BibEntry expected = new BibEntry() + .withField(StandardField.AUTHOR, "Test Author") + .withField(StandardField.TITLE, "Test Title") + .withField(StandardField.EPRINT, "1234.56789") + .withField(StandardField.DATE, "1970-01-01"); + + BibEntry copyEntry = (BibEntry) entry.clone(); + BibEntry otherEntry = new BibEntry(); + + copyEntry.setField(Map.of( + StandardField.AUTHOR, "Test Author", + StandardField.TITLE, "Test Title")); + + otherEntry.setField(Map.of( + StandardField.EPRINT, "1234.56789", + StandardField.DATE, "1970-01-01" + )); + + Set otherPrioritizedFields = Set.of(StandardField.VOLUME, StandardField.KEYWORDS); + + copyEntry.mergeWith(otherEntry, otherPrioritizedFields); + assertEquals(expected.getFields(), copyEntry.getFields()); + } + + @Test + void mergeEntriesWithNoOverlapAndExistingPriorityFields() { + BibEntry expected = new BibEntry() + .withField(StandardField.AUTHOR, "Test 
Author") + .withField(StandardField.TITLE, "Test Title") + .withField(StandardField.EPRINT, "1234.56789") + .withField(StandardField.DATE, "1970-01-01"); + + BibEntry copyEntry = (BibEntry) entry.clone(); + BibEntry otherEntry = new BibEntry(); + + copyEntry.setField(Map.of( + StandardField.AUTHOR, "Test Author", + StandardField.TITLE, "Test Title")); + + otherEntry.setField(Map.of( + StandardField.EPRINT, "1234.56789", + StandardField.DATE, "1970-01-01" + )); + + Set otherPrioritizedFields = Set.of(StandardField.AUTHOR, StandardField.EPRINT); + + copyEntry.mergeWith(otherEntry, otherPrioritizedFields); + assertEquals(expected.getFields(), copyEntry.getFields()); + } + + @Test + void mergeEntriesWithOverlapAndPriorityGivenToNonOverlappingField() { + BibEntry expected = new BibEntry() + .withField(StandardField.AUTHOR, "Test Author") + .withField(StandardField.TITLE, "Test Title") + .withField(StandardField.DATE, "1970-01-01"); + + BibEntry copyEntry = (BibEntry) entry.clone(); + BibEntry otherEntry = new BibEntry(); + + copyEntry.setField(Map.of( + StandardField.AUTHOR, "Test Author", + StandardField.TITLE, "Test Title")); + + otherEntry.setField(Map.of( + StandardField.AUTHOR, "Another Test Author", + StandardField.DATE, "1970-01-01" + )); + + Set otherPrioritizedFields = Set.of(StandardField.TITLE, StandardField.DATE); + + copyEntry.mergeWith(otherEntry, otherPrioritizedFields); + assertEquals(expected.getFields(), copyEntry.getFields()); + } + + @Test + void mergeEntriesWithOverlapAndPriorityGivenToOverlappingField() { + BibEntry expected = new BibEntry() + .withField(StandardField.AUTHOR, "Another Test Author") + .withField(StandardField.TITLE, "Test Title") + .withField(StandardField.DATE, "1970-01-01"); + + BibEntry copyEntry = (BibEntry) entry.clone(); + BibEntry otherEntry = new BibEntry(); + + copyEntry.setField(Map.of( + StandardField.AUTHOR, "Test Author", + StandardField.TITLE, "Test Title")); + + otherEntry.setField(Map.of( + StandardField.AUTHOR, "Another Test Author", + StandardField.DATE, "1970-01-01" + )); + + Set otherPrioritizedFields = Set.of(StandardField.AUTHOR, StandardField.DATE); + + copyEntry.mergeWith(otherEntry, otherPrioritizedFields); + assertEquals(expected.getFields(), copyEntry.getFields()); + } } diff --git a/src/test/resources/tinylog-test.properties b/src/test/resources/tinylog-test.properties index 5d580acbfac..640a69a1005 100644 --- a/src/test/resources/tinylog-test.properties +++ b/src/test/resources/tinylog-test.properties @@ -3,3 +3,4 @@ writer = console # uncomment the following line to enable debug logging outputs #level@org.jabref.model.entry.BibEntry = debug #level@org.jabref.logic.importer.fetcher.ResearchGate = trace +#level@org.jabref.logic.importer.fetcher.DoiFetcher = trace