diff --git a/fuji_server/data/repodois.yaml b/fuji_server/data/repodois.yaml index 785b0f55..2fe178b4 100644 --- a/fuji_server/data/repodois.yaml +++ b/fuji_server/data/repodois.yaml @@ -1,5 +1,4 @@ arch.avanjj: https://doi.org/10.17616/R31NJMYW -arch.igsn: https://doi.org/10.17616/R31NJMYW ardcx.aims: https://doi.org/10.17616/R3X908 ardcx.curtin: https://doi.org/10.17616/R3WM02 ardcx.griffith: https://doi.org/10.17616/R3FG92 @@ -58,6 +57,7 @@ ccdc.csd: https://doi.org/10.17616/R36011 cern.hepdata: https://doi.org/10.17616/R30W2H cern.inspire: https://doi.org/10.17616/R3JC9Z cern.zenodo: https://doi.org/10.17616/R3QP53 +chvf.pbyfos: https://doi.org/10.17616/R31NJNLQ clarin.clarin: https://doi.org/10.17616/R3RP5D cngb.cga: https://doi.org/10.17616/R31NJMGL cngb.gigadb: https://doi.org/10.17616/R3TG83 @@ -70,7 +70,7 @@ cornell.ndacan: https://doi.org/10.17616/R3N90S cos.osf: https://doi.org/10.17616/R3N03T crui.ogsts: https://doi.org/10.17616/R3TR3Z crui.unibo: https://doi.org/10.17616/R3P19R -crui.unipd: https://doi.org/10.17616/R31NJMHN +crui.unipd: https://doi.org/10.17616/R3W05R csc.nrd: https://doi.org/10.17616/R33649 csic.digital: https://doi.org/10.17616/R3P34J cul.ciesin: https://doi.org/10.17616/R31605 @@ -114,6 +114,7 @@ figshare.ariz: https://doi.org/10.17616/R31NJMSF figshare.ars: https://doi.org/10.17616/R3PK5R figshare.iastate: https://doi.org/10.17616/R3GV3W figshare.uct: https://doi.org/10.17616/R36R4R +fzct.bexis: https://doi.org/10.17616/R32P9Q fzj.b2share: https://doi.org/10.17616/R3VK72 fzj.tereno: https://doi.org/10.17616/R39G9T gbif.ec: https://doi.org/10.17616/R31NJNFS @@ -181,6 +182,7 @@ lxkc.dskyfi: https://doi.org/10.17616/R3W331 mdw.repository: https://doi.org/10.17616/R30M00 mit.physio: https://doi.org/10.17616/R3D06S mlbs.skuxgs: https://doi.org/10.17616/R3ZP8D +mzhu.ioerdata: https://doi.org/10.17616/R31NJNL7 nasapds.nasapds: https://doi.org/10.17616/R37593 nbqj.daks: https://doi.org/10.17616/R31NJMZS nkn.nkn: 
https://doi.org/10.17616/R3JK91 @@ -217,7 +219,6 @@ rpak.zvrzin: https://doi.org/10.17616/R31NJNLV sagebio.synapse: https://doi.org/10.17616/R3B934 si.cda: https://doi.org/10.17616/R3201S si.si: https://doi.org/10.17616/R3W49N -sml.tcia: https://doi.org/10.17616/R3NH0V sml.tdar: https://doi.org/10.17616/R3HK56 snd.bolin: https://doi.org/10.17616/R3PP99 snd.icos: https://doi.org/10.17616/R3235B diff --git a/fuji_server/helper/identifier_helper.py b/fuji_server/helper/identifier_helper.py index 71e5c947..3ebcb791 100644 --- a/fuji_server/helper/identifier_helper.py +++ b/fuji_server/helper/identifier_helper.py @@ -21,10 +21,12 @@ class IdentifierHelper: "bioproject": {"label": "BioProject ID", "source": "identifiers.org"}, "biosample": {"label": "BioSample ID", "source": "identifiers.org"}, "doi": {"label": "Digital Object Identifier (DOI)", "source": "datacite.org"}, + "dpid": {"label": "Decentralized Persistent Identifier (dPID)", "source": "dpid.org"}, "ensembl": {"label": "Ensembl ID", "source": "identifiers.org"}, "genome": {"label": "GenBank or RefSeq genome", "source": "identifiers.org"}, "gnd": {"label": "Gemeinsame Normdatei (GND) ID", "source": "f-uji.net"}, "handle": {"label": "Handle System ID", "source": "datacite.org"}, + "ipfs": {"label": "InterPlanetary File System Content Identifier (IPFS CID)", "source": "ipfs.io"}, "lsid": {"label": "Life Science Identifier", "source": "datacite.org"}, "pmid": {"label": "PubMed ID", "source": "datacite.org"}, "pmcid": {"label": "PubMed Central ID", "source": "identifiers.org"}, @@ -36,6 +38,10 @@ class IdentifierHelper: "identifiers.org": {"label": "Identifiers.org Identifier", "source": "identifiers.org"}, "w3id": {"label": "Permanent Identifier for the Web (W3ID)", "source": "identifiers.org"}, } + # IPFS gateway domains for CID resolution (ordered by resolution reliability for DeSci content) + IPFS_GATEWAYS = ["ipfs.desci.com", "pub.desci.com", "dweb.link", "ipfs.io", "cloudflare-ipfs.com", 
"gateway.pinata.cloud"] + # dPID resolver domains + DPID_DOMAINS = ["dpid.org", "beta.dpid.org", "dev.dpid.org"] # identifiers.org pattern # TODO: check if this is needed.. if so ..complete and add check to FAIRcheck IDENTIFIERS_PIDS = r"https://identifiers.org/[provider_code/]namespace:accession" @@ -125,6 +131,29 @@ def __init__(self, idstring, logger=None): self.preferred_schema = "w3id" self.identifier_url = self.identifier self.normalized_id = self.identifier + + # dPID check - support dpid:// scheme and dpid.org URLs + elif self.is_dpid(): + dpid_id = self.extract_dpid_id() + if dpid_id: + self.identifier_schemes = ["dpid", "url"] + self.preferred_schema = "dpid" + # Normalize to canonical dpid.org URL + self.identifier_url = f"https://dpid.org/{dpid_id}" + self.normalized_id = f"dpid://{dpid_id}" + self.is_persistent = True + + # IPFS CID check - support ipfs:// scheme and IPFS gateway URLs + elif self.is_ipfs_cid(): + cid = self.extract_ipfs_cid() + if cid: + self.identifier_schemes = ["ipfs", "url"] + self.preferred_schema = "ipfs" + # Use ipfs.io as the canonical gateway for resolution + self.identifier_url = f"https://ipfs.io/ipfs/{cid}" + self.normalized_id = f"ipfs://{cid}" + self.is_persistent = True + # identifiers.org elif idparts.netloc == "identifiers.org": @@ -206,6 +235,138 @@ def is_hash(self): except Exception: return False + def is_dpid(self): + """Check if the identifier is a dPID (Decentralized Persistent Identifier). 
# dPID / IPFS CID helper methods of IdentifierHelper.
# The enclosing class header lies outside this span, so the methods are shown
# dedented; indent them one level when placing them in the class body.


def is_dpid(self):
    """Return True if ``self.identifier`` is a dPID.

    A dPID (Decentralized Persistent Identifier) is recognised in two forms:

    - the ``dpid://`` scheme, e.g. ``dpid://500``
    - a URL on ``dpid.org`` or one of its subdomains, e.g.
      ``https://dpid.org/500`` or ``https://beta.dpid.org/500/v1``

    Detection is delegated to ``extract_dpid_id`` so that "is a dPID" and
    "a dPID number can be extracted" can never disagree.
    """
    return self.extract_dpid_id() is not None


def extract_dpid_id(self):
    """Return the dPID number of ``self.identifier`` as a string, else None.

    Examples: ``"500"`` for ``dpid://500``, ``https://dpid.org/500`` and
    ``https://beta.dpid.org/500/v2``.

    Returns None when the identifier is missing, is not a string, is not
    hosted on a dPID resolver domain, or does not carry a numeric ID.
    Surrounding whitespace is ignored.
    """
    identifier = self.identifier
    if not isinstance(identifier, str):
        return None
    identifier = identifier.strip()

    # Digits are matched as ASCII [0-9] on purpose: str.isdigit() and the
    # regex class \d also accept characters such as "²" or Arabic-Indic
    # digits, which are not valid dPID numbers.
    scheme = "dpid://"
    if identifier.startswith(scheme):
        # Only the first component is the ID; anything after "/" is ignored.
        number = identifier[len(scheme):].split("/", 1)[0]
        return number if re.fullmatch(r"[0-9]+", number) else None

    try:
        parts = urllib.parse.urlparse(identifier)
        # .hostname drops userinfo and port and is already lower-cased.
        host = (parts.hostname or "").lower()
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. broken IPv6 literals).
        return None

    if host not in self.DPID_DOMAINS and not host.endswith(".dpid.org"):
        return None

    # "<number>" optionally followed by a version suffix such as "/v1".
    match = re.fullmatch(r"([0-9]+)(?:/v[0-9]+)?", parts.path.strip("/"))
    return match.group(1) if match else None


def is_ipfs_cid(self):
    """Return True if ``self.identifier`` carries an IPFS Content Identifier.

    Supported forms:

    - the ``ipfs://`` scheme, e.g. ``ipfs://bafybeic...``
    - path-style gateway URLs, e.g. ``https://ipfs.io/ipfs/bafybeic...``
    - a raw CIDv0 (``Qm...``) or CIDv1 (``bafy...`` / ``bafk...``) string

    Detection is delegated to ``extract_ipfs_cid`` so both methods always
    agree on what counts as a CID.
    """
    return self.extract_ipfs_cid() is not None


def extract_ipfs_cid(self):
    """Return the validated IPFS CID found in ``self.identifier``, else None.

    The CID is taken from the authority of an ``ipfs://`` URI, from the path
    segment following ``/ipfs/`` in a URL, or from the identifier itself when
    it is a raw CID. Trailing paths, query strings and fragments are ignored,
    as is surrounding whitespace.

    Any host is accepted for gateway URLs: path-style gateways all share the
    ``/ipfs/<cid>`` layout, so the CID syntax check is what decides.
    """
    identifier = self.identifier
    if not isinstance(identifier, str):
        return None
    identifier = identifier.strip()

    scheme = "ipfs://"
    if identifier.startswith(scheme):
        # The CID ends at the first path, query or fragment delimiter.
        candidate = re.split(r"[/?#]", identifier[len(scheme):], maxsplit=1)[0]
    else:
        try:
            path = urllib.parse.urlparse(identifier).path
        except ValueError:
            return None
        # Look for "/ipfs/" in the URL path only, so that a query string
        # which merely mentions "/ipfs/" is not mistaken for a gateway URL.
        _, marker, tail = path.partition("/ipfs/")
        candidate = tail.split("/", 1)[0] if marker else identifier

    # CIDv0: "Qm" + 44 base58 characters (46 in total).
    # CIDv1: "bafy"/"bafk" + at least 50 lower-case base32 characters.
    cid_pattern = r"Qm[1-9A-HJ-NP-Za-km-z]{44}|baf[yk][a-z2-7]{50,}"
    # fullmatch (rather than match with "$") rejects trailing garbage and
    # trailing newlines instead of silently returning a prefix.
    return candidate if re.fullmatch(cid_pattern, candidate) else None
# Tests for the dPID and IPFS CID support of IdentifierHelper.

import pytest

from fuji_server.helper.identifier_helper import IdentifierHelper


class TestDPID:
    """Tests for dPID (Decentralized Persistent Identifier) detection and extraction."""

    @pytest.mark.parametrize(
        "identifier,expected_id",
        [
            # dpid:// scheme
            ("dpid://500", "500"),
            ("dpid://1", "1"),
            ("dpid://12345", "12345"),
            # dpid.org URLs
            ("https://dpid.org/500", "500"),
            ("https://dpid.org/1", "1"),
            ("http://dpid.org/500", "500"),
            # beta.dpid.org (production resolver)
            ("https://beta.dpid.org/500", "500"),
            ("https://beta.dpid.org/123", "123"),
            # dev.dpid.org
            ("https://dev.dpid.org/500", "500"),
            # With version suffix
            ("https://dpid.org/500/v1", "500"),
            ("https://dpid.org/500/v2", "500"),
        ],
    )
    def test_valid_dpid_detection(self, identifier, expected_id):
        """Valid dPIDs are detected and normalized to the canonical forms."""
        helper = IdentifierHelper(identifier)
        assert helper.is_dpid(), f"Expected {identifier} to be detected as dPID"
        assert helper.extract_dpid_id() == expected_id
        assert helper.preferred_schema == "dpid"
        assert "dpid" in helper.identifier_schemes
        assert helper.is_persistent is True
        assert helper.normalized_id == f"dpid://{expected_id}"
        assert helper.identifier_url == f"https://dpid.org/{expected_id}"

    @pytest.mark.parametrize(
        "identifier",
        [
            # Not dPID URLs
            "https://example.com/500",
            "https://doi.org/10.1234/test",
            "https://zenodo.org/record/123456",
            # Invalid dpid:// scheme formats
            "dpid://",
            "dpid://abc",  # non-numeric
            "dpid://beta/500",  # beta prefix not supported in scheme
            # Invalid dpid.org paths
            "https://dpid.org/",
            "https://dpid.org/abc",
            "https://dpid.org/test/path",
            # Other identifier types
            "10.1234/test.doi",
            "ark:/12345/test",
            "urn:isbn:0451450523",
            # IPFS CIDs (should not match dPID)
            "ipfs://bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi",
            "https://ipfs.io/ipfs/QmTest123456789012345678901234567890123456",
        ],
    )
    def test_invalid_dpid_detection(self, identifier):
        """Non-dPID identifiers are not detected as dPID."""
        helper = IdentifierHelper(identifier)
        assert not helper.is_dpid(), f"Expected {identifier} to NOT be detected as dPID"

    def test_dpid_with_subdomains(self):
        """Any host ending in .dpid.org is treated as a dPID resolver."""
        helper = IdentifierHelper("https://custom.dpid.org/500")
        assert helper.is_dpid()
        assert helper.extract_dpid_id() == "500"


class TestIPFSCID:
    """Tests for IPFS CID (Content Identifier) detection and extraction."""

    # Example CIDv0 (base58, starts with Qm, 46 chars)
    VALID_CIDV0 = "QmYwAPJzv5CZsnA625s3Xf2nemtYgPpHdWEz79ojWnPbdG"
    # Example CIDv1 (base32, starts with bafy)
    VALID_CIDV1 = "bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi"

    # Stacked parametrization: every (template, CID) pair is its own test
    # case, so a CIDv0 failure no longer hides the CIDv1 result.
    @pytest.mark.parametrize("cid", [VALID_CIDV0, VALID_CIDV1], ids=["cidv0", "cidv1"])
    @pytest.mark.parametrize(
        "identifier_template",
        [
            # ipfs:// scheme
            "ipfs://{cid}",
            "ipfs://{cid}/path/to/file.txt",
            # Gateway URLs
            "https://ipfs.io/ipfs/{cid}",
            "https://ipfs.desci.com/ipfs/{cid}",
            "https://pub.desci.com/ipfs/{cid}",
            "https://dweb.link/ipfs/{cid}",
            "https://cloudflare-ipfs.com/ipfs/{cid}",
            "https://gateway.pinata.cloud/ipfs/{cid}",
            # With file paths
            "https://ipfs.io/ipfs/{cid}/data/file.json",
            # Raw CIDs
            "{cid}",
        ],
    )
    def test_valid_ipfs_cid_detection(self, identifier_template, cid):
        """Valid IPFS CIDs are detected in every supported identifier form."""
        identifier = identifier_template.format(cid=cid)
        helper = IdentifierHelper(identifier)
        assert helper.is_ipfs_cid(), f"Expected {identifier} to be detected as IPFS CID"
        assert helper.extract_ipfs_cid() == cid
        assert helper.preferred_schema == "ipfs"
        assert "ipfs" in helper.identifier_schemes
        assert helper.is_persistent is True
        assert helper.normalized_id == f"ipfs://{cid}"

    @pytest.mark.parametrize(
        "identifier",
        [
            # Not IPFS URLs
            "https://example.com/file.txt",
            "https://doi.org/10.1234/test",
            # Invalid CID formats
            "ipfs://",
            "ipfs://invalid",
            "ipfs://Qm",  # Too short
            "ipfs://QmTooShort",  # Invalid length
            "ipfs://bafyshort",  # Too short for CIDv1
            # Gateway URLs without a valid CID
            "https://ipfs.io/ipfs/invalid",
            "https://ipfs.io/ipfs/",
            # dPIDs (should not match IPFS)
            "dpid://500",
            "https://dpid.org/500",
            # DOIs
            "10.1234/test.doi",
            # Random strings
            "just-a-random-string",
            "12345678901234567890",
        ],
    )
    def test_invalid_ipfs_cid_detection(self, identifier):
        """Non-IPFS identifiers are not detected as IPFS CID."""
        helper = IdentifierHelper(identifier)
        assert not helper.is_ipfs_cid(), f"Expected {identifier} to NOT be detected as IPFS CID"

    def test_cidv0_format(self):
        """CIDv0 must start with Qm and be exactly 46 base58 characters."""
        # Valid CIDv0
        helper = IdentifierHelper(self.VALID_CIDV0)
        assert helper.is_ipfs_cid()
        assert helper.extract_ipfs_cid() == self.VALID_CIDV0

        # Invalid CIDv0 - wrong prefix
        helper = IdentifierHelper("Xm" + self.VALID_CIDV0[2:])
        assert not helper.is_ipfs_cid()

        # Invalid CIDv0 - wrong length
        helper = IdentifierHelper(self.VALID_CIDV0[:40])
        assert not helper.is_ipfs_cid()

    def test_cidv1_format(self):
        """CIDv1 is accepted with both the bafy and the bafk prefix."""
        # Valid CIDv1 with bafy prefix
        helper = IdentifierHelper(self.VALID_CIDV1)
        assert helper.is_ipfs_cid()
        assert helper.extract_ipfs_cid() == self.VALID_CIDV1

        # Valid CIDv1 with bafk prefix
        bafk_cid = "bafkreigaknpexyvxt76zgkitavbwx6ejgfheup5oybpm77f3pxzrvwpfdi"
        helper = IdentifierHelper(bafk_cid)
        assert helper.is_ipfs_cid()
        assert helper.extract_ipfs_cid() == bafk_cid

    def test_ipfs_gateway_order(self):
        """The DeSci gateways come first in the gateway list."""
        # ipfs.desci.com should be first for best DeSci content resolution
        assert IdentifierHelper.IPFS_GATEWAYS[0] == "ipfs.desci.com"
        assert IdentifierHelper.IPFS_GATEWAYS[1] == "pub.desci.com"
        # With the first two slots taken by the DeSci gateways, membership
        # implies that the general-purpose gateways come after them.
        assert "dweb.link" in IdentifierHelper.IPFS_GATEWAYS
        assert "ipfs.io" in IdentifierHelper.IPFS_GATEWAYS


class TestIdentifierHelperIntegration:
    """Integration tests to ensure dPID and IPFS don't conflict with other identifiers."""

    def test_doi_not_detected_as_dpid_or_ipfs(self):
        """DOIs should be detected as DOIs, not dPID or IPFS."""
        helper = IdentifierHelper("10.1234/test.doi")
        assert helper.preferred_schema == "doi"
        assert not helper.is_dpid()
        assert not helper.is_ipfs_cid()

    def test_handle_not_detected_as_dpid_or_ipfs(self):
        """Handles should not be detected as dPID or IPFS."""
        helper = IdentifierHelper("hdl:10.1234/test")
        assert not helper.is_dpid()
        assert not helper.is_ipfs_cid()

    def test_ark_not_detected_as_dpid_or_ipfs(self):
        """ARKs should be detected as ARKs, not dPID or IPFS."""
        helper = IdentifierHelper("ark:/12345/test")
        assert helper.preferred_schema == "ark"
        assert not helper.is_dpid()
        assert not helper.is_ipfs_cid()

    def test_uuid_not_detected_as_dpid_or_ipfs(self):
        """UUIDs should not be detected as dPID or IPFS."""
        helper = IdentifierHelper("550e8400-e29b-41d4-a716-446655440000")
        assert helper.preferred_schema == "uuid"
        assert not helper.is_dpid()
        assert not helper.is_ipfs_cid()

    def test_w3id_not_detected_as_dpid_or_ipfs(self):
        """W3ID URLs should be detected as W3ID, not dPID or IPFS."""
        helper = IdentifierHelper("https://w3id.org/example/test")
        assert helper.preferred_schema == "w3id"
        assert not helper.is_dpid()
        assert not helper.is_ipfs_cid()