diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java
index 1518b6326fb..ca99dafabf4 100644
--- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java
+++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/ArXivFetcher.java
@@ -42,6 +42,7 @@
 import org.jabref.model.entry.field.InternalField;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.identifier.ArXivIdentifier;
+import org.jabref.logic.importer.util.UrlIdentifierParser;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.types.StandardEntryType;
 import org.jabref.model.paging.Page;
@@ -339,7 +340,7 @@ public Page performSearchPaged(BaseQueryNode queryNode, int pageNumber
     public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
         CompletableFuture<Optional<BibEntry>> arXivBibEntryPromise = arXiv.asyncPerformSearchById(identifier);
         if (this.doiFetcher != null) {
-            inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, ArXivIdentifier.parse(identifier));
+            inplaceAsyncInfuseArXivWithDoi(arXivBibEntryPromise, UrlIdentifierParser.parseArXiv(identifier));
         }
         return arXivBibEntryPromise.join();
     }
diff --git a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java
index a1c640dfc48..0fc26340ffc 100644
--- a/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java
+++ b/jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java
@@ -31,6 +31,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.identifier.DOI;
+import org.jabref.logic.importer.util.UrlIdentifierParser;
 import org.jabref.model.entry.types.StandardEntryType;
 import org.jabref.model.util.DummyFileUpdateMonitor;
 import org.jabref.model.util.OptionalUtil;
@@ -88,7 +89,7 @@ public Optional getHelpPage() {
     private void doAPILimiting(String identifier) {
         // Without a generic API Rate Limiter implemented on the project, use Guava's RateLimiter for avoiding
         // API throttling when multiple threads are working, specially during DOI Content Negotiations
-        Optional<DOI> doi = DOI.parse(identifier);
+        Optional<DOI> doi = UrlIdentifierParser.parseDOI(identifier);
 
         try {
             Optional<String> agency;
@@ -121,7 +122,7 @@ protected CompletableFuture<Optional<BibEntry>> asyncPerformSearchById(String id
 
     @Override
     public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
-        DOI doi = DOI.parse(identifier)
+        DOI doi = UrlIdentifierParser.parseDOI(identifier)
                 .orElseThrow(() -> new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier)));
 
         URL doiURL;
@@ -141,7 +142,7 @@ public Optional performSearchById(String identifier) throws FetcherExc
                 throw new FetcherException("Invalid URL", e);
             }
             if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) {
-                return new Medra().performSearchById(identifier);
+                return new Medra().performSearchById(doi.asString());
             }
 
             URLDownload download = getUrlDownload(doiURL);
diff --git a/jablib/src/main/java/org/jabref/logic/importer/util/UrlIdentifierParser.java b/jablib/src/main/java/org/jabref/logic/importer/util/UrlIdentifierParser.java
new file mode 100644
index 00000000000..9430b0e2f6f
--- /dev/null
+++ b/jablib/src/main/java/org/jabref/logic/importer/util/UrlIdentifierParser.java
@@ -0,0 +1,81 @@
+package org.jabref.logic.importer.util;
+
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jabref.model.entry.identifier.ArXivIdentifier;
+import org.jabref.model.entry.identifier.DOI;
+
+/**
+ * Extracts DOIs and arXiv identifiers from plain identifiers and from URLs that embed them
+ * (doi.org resolver links, ACM Digital Library pages, arXiv abstract and PDF links).
+ * <p>
+ * The URL patterns only locate a candidate identifier; validation is delegated to
+ * {@link DOI#parse(String)} and {@link ArXivIdentifier#parse(String)}. If no URL pattern applies,
+ * or the captured candidate is rejected, the complete trimmed input is parsed instead.
+ */
+public final class UrlIdentifierParser {
+
+    /** Resolver links; the capture stops at whitespace and at a query string or fragment. */
+    private static final Pattern DOI_URL_PATTERN = Pattern.compile(
+            "https?://(?:dx\\.)?doi\\.org/([^?#\\s]+)",
+            Pattern.CASE_INSENSITIVE);
+
+    /** ACM pages such as /doi/10.x, /doi/abs/10.x or /doi/pdf/10.x; one optional view segment. */
+    private static final Pattern DOI_ACM_PATTERN = Pattern.compile(
+            "https?://dl\\.acm\\.org/doi/(?:[a-z]+/)?(10\\.[^?#\\s]+)",
+            Pattern.CASE_INSENSITIVE);
+
+    /** The id class includes '/' so that old-style ids such as math.GT/0309136 are captured. */
+    private static final Pattern ARXIV_URL_PATTERN = Pattern.compile(
+            "https?://(?:www\\.|export\\.)?arxiv\\.org/(?:abs|pdf)/"
+                    + "([\\w.\\-/]+?)(?:\\.pdf)?(?:[?#]\\S*)?$",
+            Pattern.CASE_INSENSITIVE);
+
+    private UrlIdentifierParser() {
+        // static utility class, not meant to be instantiated
+    }
+
+    /**
+     * Parses a DOI from a plain DOI or from a URL containing one.
+     *
+     * @param input plain DOI or URL; may be {@code null}
+     * @return the parsed DOI, or an empty Optional for {@code null}, blank or unparsable input
+     */
+    public static Optional<DOI> parseDOI(String input) {
+        if (input == null || input.isBlank()) {
+            return Optional.empty();
+        }
+        String trimmedInput = input.trim();
+
+        return firstGroup(DOI_URL_PATTERN, trimmedInput)
+                .or(() -> firstGroup(DOI_ACM_PATTERN, trimmedInput))
+                .flatMap(DOI::parse)
+                // the parser gets the untouched input if the URL heuristics did not yield a DOI
+                .or(() -> DOI.parse(trimmedInput));
+    }
+
+    /**
+     * Parses an arXiv identifier from a plain id or from an arXiv abstract/PDF URL.
+     *
+     * @param input plain arXiv id or URL; may be {@code null}
+     * @return the parsed identifier, or an empty Optional for {@code null}, blank or unparsable input
+     */
+    public static Optional<ArXivIdentifier> parseArXiv(String input) {
+        if (input == null || input.isBlank()) {
+            return Optional.empty();
+        }
+        String trimmedInput = input.trim();
+
+        return firstGroup(ARXIV_URL_PATTERN, trimmedInput)
+                .flatMap(ArXivIdentifier::parse)
+                .or(() -> ArXivIdentifier.parse(trimmedInput));
+    }
+
+    /** Returns capture group 1 of the first match of {@code pattern} in {@code text}, if any. */
+    private static Optional<String> firstGroup(Pattern pattern, String text) {
+        Matcher matcher = pattern.matcher(text);
+        return matcher.find() ? Optional.of(matcher.group(1)) : Optional.empty();
+    }
+}
diff --git a/jablib/src/test/java/org/jabref/logic/importer/util/UrlIdentifierParserTest.java b/jablib/src/test/java/org/jabref/logic/importer/util/UrlIdentifierParserTest.java
new file mode 100644
index 00000000000..d3dd6b85ae3
--- /dev/null
+++ b/jablib/src/test/java/org/jabref/logic/importer/util/UrlIdentifierParserTest.java
@@ -0,0 +1,132 @@
+package org.jabref.logic.importer.util;
+
+import java.util.Optional;
+
+import org.jabref.model.entry.identifier.ArXivIdentifier;
+import org.jabref.model.entry.identifier.DOI;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class UrlIdentifierParserTest {
+
+    private static final String PLAIN_DOI = "10.1145/3544548.3580995";
+    private static final String PLAIN_ARXIV_ID = "2203.02155";
+    private static final String OLD_STYLE_ARXIV_ID = "math.GT/0309136";
+
+    /** Expects {@code input} to yield the same DOI as parsing {@code plainDoi} directly. */
+    private static void assertParsesToDoi(String plainDoi, String input) {
+        Optional<DOI> expected = DOI.parse(plainDoi);
+        assertTrue(expected.isPresent(), "fixture is not a valid DOI: " + plainDoi);
+        assertEquals(expected, UrlIdentifierParser.parseDOI(input));
+    }
+
+    /** Expects {@code input} to yield the same identifier as parsing {@code plainId} directly. */
+    private static void assertParsesToArXivId(String plainId, String input) {
+        Optional<ArXivIdentifier> expected = ArXivIdentifier.parse(plainId);
+        assertTrue(expected.isPresent(), "fixture is not a valid arXiv id: " + plainId);
+        assertEquals(expected, UrlIdentifierParser.parseArXiv(input));
+    }
+
+    @Test
+    void parseDOIFromPlainDOI() {
+        assertParsesToDoi(PLAIN_DOI, PLAIN_DOI);
+    }
+
+    @Test
+    void parseDOIFromDoiOrgURL() {
+        assertParsesToDoi(PLAIN_DOI, "https://doi.org/10.1145/3544548.3580995");
+    }
+
+    @Test
+    void parseDOIFromDxDoiOrgURL() {
+        assertParsesToDoi(PLAIN_DOI, "https://dx.doi.org/10.1145/3544548.3580995");
+    }
+
+    @Test
+    void parseDOIFromHTTPURL() {
+        assertParsesToDoi(PLAIN_DOI, "http://doi.org/10.1145/3544548.3580995");
+    }
+
+    @Test
+    void parseDOIFromACMDigitalLibrary() {
+        assertParsesToDoi(PLAIN_DOI, "https://dl.acm.org/doi/10.1145/3544548.3580995");
+    }
+
+    @Test
+    void parseDOIFromACMAbsURL() {
+        assertParsesToDoi(PLAIN_DOI, "https://dl.acm.org/doi/abs/10.1145/3544548.3580995");
+    }
+
+    @Test
+    void parseDOIFromACMPdfURL() {
+        assertParsesToDoi(PLAIN_DOI, "https://dl.acm.org/doi/pdf/10.1145/3544548.3580995");
+    }
+
+    @Test
+    void parseDOIIgnoresQueryString() {
+        assertParsesToDoi(PLAIN_DOI, "https://doi.org/10.1145/3544548.3580995?utm_source=test");
+    }
+
+    @Test
+    void parseDOIReturnsEmptyForNull() {
+        assertEquals(Optional.empty(), UrlIdentifierParser.parseDOI(null));
+    }
+
+    @Test
+    void parseDOIReturnsEmptyForEmptyString() {
+        assertEquals(Optional.empty(), UrlIdentifierParser.parseDOI(""));
+    }
+
+    @Test
+    void parseDOIReturnsEmptyForInvalidURL() {
+        assertEquals(Optional.empty(), UrlIdentifierParser.parseDOI("https://example.com"));
+    }
+
+    @Test
+    void parseArXivFromPlainID() {
+        assertParsesToArXivId(PLAIN_ARXIV_ID, PLAIN_ARXIV_ID);
+    }
+
+    @Test
+    void parseArXivFromAbsURL() {
+        assertParsesToArXivId(PLAIN_ARXIV_ID, "https://arxiv.org/abs/2203.02155");
+    }
+
+    @Test
+    void parseArXivFromPDFURL() {
+        assertParsesToArXivId(PLAIN_ARXIV_ID, "https://arxiv.org/pdf/2203.02155.pdf");
+    }
+
+    @Test
+    void parseArXivFromHTTPURL() {
+        assertParsesToArXivId(PLAIN_ARXIV_ID, "http://arxiv.org/abs/2203.02155");
+    }
+
+    @Test
+    void parseArXivReturnsEmptyForNull() {
+        assertEquals(Optional.empty(), UrlIdentifierParser.parseArXiv(null));
+    }
+
+    @Test
+    void parseArXivReturnsEmptyForEmptyString() {
+        assertEquals(Optional.empty(), UrlIdentifierParser.parseArXiv(""));
+    }
+
+    @Test
+    void parseArXivReturnsEmptyForInvalidURL() {
+        assertEquals(Optional.empty(), UrlIdentifierParser.parseArXiv("https://example.com"));
+    }
+
+    @Test
+    void parseArXivHandlesOldIDFormat() {
+        assertParsesToArXivId(OLD_STYLE_ARXIV_ID, "https://arxiv.org/abs/math.GT/0309136");
+    }
+
+    @Test
+    void parseArXivHandlesOldIDFormatInPdfURL() {
+        assertParsesToArXivId(OLD_STYLE_ARXIV_ID, "https://arxiv.org/pdf/math.GT/0309136.pdf");
+    }
+}