Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions fuji_server/data/repodois.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
arch.avanjj: https://doi.org/10.17616/R31NJMYW
arch.igsn: https://doi.org/10.17616/R31NJMYW
ardcx.aims: https://doi.org/10.17616/R3X908
ardcx.curtin: https://doi.org/10.17616/R3WM02
ardcx.griffith: https://doi.org/10.17616/R3FG92
Expand Down Expand Up @@ -58,6 +57,7 @@ ccdc.csd: https://doi.org/10.17616/R36011
cern.hepdata: https://doi.org/10.17616/R30W2H
cern.inspire: https://doi.org/10.17616/R3JC9Z
cern.zenodo: https://doi.org/10.17616/R3QP53
chvf.pbyfos: https://doi.org/10.17616/R31NJNLQ
clarin.clarin: https://doi.org/10.17616/R3RP5D
cngb.cga: https://doi.org/10.17616/R31NJMGL
cngb.gigadb: https://doi.org/10.17616/R3TG83
Expand All @@ -70,7 +70,7 @@ cornell.ndacan: https://doi.org/10.17616/R3N90S
cos.osf: https://doi.org/10.17616/R3N03T
crui.ogsts: https://doi.org/10.17616/R3TR3Z
crui.unibo: https://doi.org/10.17616/R3P19R
crui.unipd: https://doi.org/10.17616/R31NJMHN
crui.unipd: https://doi.org/10.17616/R3W05R
csc.nrd: https://doi.org/10.17616/R33649
csic.digital: https://doi.org/10.17616/R3P34J
cul.ciesin: https://doi.org/10.17616/R31605
Expand Down Expand Up @@ -114,6 +114,7 @@ figshare.ariz: https://doi.org/10.17616/R31NJMSF
figshare.ars: https://doi.org/10.17616/R3PK5R
figshare.iastate: https://doi.org/10.17616/R3GV3W
figshare.uct: https://doi.org/10.17616/R36R4R
fzct.bexis: https://doi.org/10.17616/R32P9Q
fzj.b2share: https://doi.org/10.17616/R3VK72
fzj.tereno: https://doi.org/10.17616/R39G9T
gbif.ec: https://doi.org/10.17616/R31NJNFS
Expand Down Expand Up @@ -181,6 +182,7 @@ lxkc.dskyfi: https://doi.org/10.17616/R3W331
mdw.repository: https://doi.org/10.17616/R30M00
mit.physio: https://doi.org/10.17616/R3D06S
mlbs.skuxgs: https://doi.org/10.17616/R3ZP8D
mzhu.ioerdata: https://doi.org/10.17616/R31NJNL7
nasapds.nasapds: https://doi.org/10.17616/R37593
nbqj.daks: https://doi.org/10.17616/R31NJMZS
nkn.nkn: https://doi.org/10.17616/R3JK91
Expand Down Expand Up @@ -217,7 +219,6 @@ rpak.zvrzin: https://doi.org/10.17616/R31NJNLV
sagebio.synapse: https://doi.org/10.17616/R3B934
si.cda: https://doi.org/10.17616/R3201S
si.si: https://doi.org/10.17616/R3W49N
sml.tcia: https://doi.org/10.17616/R3NH0V
sml.tdar: https://doi.org/10.17616/R3HK56
snd.bolin: https://doi.org/10.17616/R3PP99
snd.icos: https://doi.org/10.17616/R3235B
Expand Down
161 changes: 161 additions & 0 deletions fuji_server/helper/identifier_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ class IdentifierHelper:
"bioproject": {"label": "BioProject ID", "source": "identifiers.org"},
"biosample": {"label": "BioSample ID", "source": "identifiers.org"},
"doi": {"label": "Digital Object Identifier (DOI)", "source": "datacite.org"},
"dpid": {"label": "Decentralized Persistent Identifier (dPID)", "source": "dpid.org"},
"ensembl": {"label": "Ensembl ID", "source": "identifiers.org"},
"genome": {"label": "GenBank or RefSeq genome", "source": "identifiers.org"},
"gnd": {"label": "Gemeinsame Normdatei (GND) ID", "source": "f-uji.net"},
"handle": {"label": "Handle System ID", "source": "datacite.org"},
"ipfs": {"label": "InterPlanetary File System Content Identifier (IPFS CID)", "source": "ipfs.io"},
"lsid": {"label": "Life Science Identifier", "source": "datacite.org"},
"pmid": {"label": "PubMed ID", "source": "datacite.org"},
"pmcid": {"label": "PubMed Central ID", "source": "identifiers.org"},
Expand All @@ -36,6 +38,10 @@ class IdentifierHelper:
"identifiers.org": {"label": "Identifiers.org Identifier", "source": "identifiers.org"},
"w3id": {"label": "Permanent Identifier for the Web (W3ID)", "source": "identifiers.org"},
}
# IPFS gateway domains for CID resolution (ordered by resolution reliability for DeSci content)
IPFS_GATEWAYS = ["ipfs.desci.com", "pub.desci.com", "dweb.link", "ipfs.io", "cloudflare-ipfs.com", "gateway.pinata.cloud"]
# dPID resolver domains
DPID_DOMAINS = ["dpid.org", "beta.dpid.org", "dev.dpid.org"]
# identifiers.org pattern
# TODO: check if this is needed.. if so ..complete and add check to FAIRcheck
IDENTIFIERS_PIDS = r"https://identifiers.org/[provider_code/]namespace:accession"
Expand Down Expand Up @@ -125,6 +131,29 @@ def __init__(self, idstring, logger=None):
self.preferred_schema = "w3id"
self.identifier_url = self.identifier
self.normalized_id = self.identifier

# dPID check - support dpid:// scheme and dpid.org URLs
elif self.is_dpid():
dpid_id = self.extract_dpid_id()
if dpid_id:
self.identifier_schemes = ["dpid", "url"]
self.preferred_schema = "dpid"
# Normalize to canonical dpid.org URL
self.identifier_url = f"https://dpid.org/{dpid_id}"
self.normalized_id = f"dpid://{dpid_id}"
self.is_persistent = True

# IPFS CID check - support ipfs:// scheme and IPFS gateway URLs
elif self.is_ipfs_cid():
cid = self.extract_ipfs_cid()
if cid:
self.identifier_schemes = ["ipfs", "url"]
self.preferred_schema = "ipfs"
# Use ipfs.io as the canonical gateway for resolution
self.identifier_url = f"https://ipfs.io/ipfs/{cid}"
self.normalized_id = f"ipfs://{cid}"
self.is_persistent = True

# identifiers.org

elif idparts.netloc == "identifiers.org":
Expand Down Expand Up @@ -206,6 +235,138 @@ def is_hash(self):
except Exception:
return False

def is_dpid(self):
    """Check if the identifier is a dPID (Decentralized Persistent Identifier).

    Supports:
    - dpid:// scheme (e.g., dpid://500, dpid://500/v1)
    - dpid.org URLs (e.g., https://dpid.org/500, https://beta.dpid.org/500/v2)

    Only ASCII digits are accepted for the dPID number and version.

    Returns:
        True if the identifier has one of the supported dPID forms,
        otherwise False (including for empty or non-string identifiers).
    """
    ident = self.identifier
    if not ident or not isinstance(ident, str):
        return False

    # dpid:// scheme: the first segment after the scheme must be the numeric ID.
    if ident.startswith("dpid://"):
        first_segment = ident[len("dpid://"):].split("/")[0]
        # [0-9] + fullmatch instead of str.isdigit(): isdigit() also accepts
        # non-ASCII digits (e.g. Arabic-Indic or superscript digits).
        return re.fullmatch(r"[0-9]+", first_segment) is not None

    try:
        idparts = urllib.parse.urlparse(ident)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6 brackets).
        return False

    # Use the parsed hostname rather than slicing netloc at ":" so that
    # userinfo such as "dpid.org:pw@evil.example" cannot pose as dpid.org.
    host = (idparts.hostname or "").lower()
    if host not in self.DPID_DOMAINS and not host.endswith(".dpid.org"):
        return False

    # The path must be "<number>" optionally followed by "/v<number>".
    path = idparts.path.strip("/")
    return re.fullmatch(r"[0-9]+(?:/v[0-9]+)?", path) is not None

def extract_dpid_id(self):
    """Extract the dPID number from a dPID URL or scheme.

    Handles "dpid://<number>[/...]" and URLs whose path is "<number>" or
    "<number>/v<number>". The host of a URL is not checked here; this only
    parses the path.

    Returns:
        The dPID number as a string of ASCII digits (e.g. "500" from
        "https://dpid.org/500", "https://dpid.org/500/v1" or "dpid://500"),
        or None if no such number can be extracted.
    """
    ident = self.identifier
    if not ident or not isinstance(ident, str):
        return None

    # dpid:// scheme: the first segment after the scheme is the numeric ID.
    if ident.startswith("dpid://"):
        candidate = ident[len("dpid://"):].split("/")[0]
        # ASCII digits only; str.isdigit() would also accept other Unicode digits.
        return candidate if re.fullmatch(r"[0-9]+", candidate) else None

    try:
        path = urllib.parse.urlparse(ident).path.strip("/")
    except ValueError:
        # urlparse rejects some malformed URLs.
        return None

    # Numeric ID with an optional version suffix such as "/v1"; the version
    # is dropped from the returned value.
    match = re.fullmatch(r"([0-9]+)(?:/v[0-9]+)?", path)
    return match.group(1) if match else None

def is_ipfs_cid(self):
    """Check if the identifier is an IPFS Content Identifier (CID).

    Supports:
    - ipfs:// scheme (e.g., ipfs://bafybeic...)
    - Path-style gateway URLs (e.g., https://ipfs.io/ipfs/bafybeic...);
      any host is accepted as long as the path contains "/ipfs/<cid>"
    - Raw CIDv0 (Qm...) and CIDv1 (bafy.../bafk...) identifiers

    Returns:
        True if a syntactically valid CID is found in one of these forms,
        otherwise False (including for empty or non-string identifiers).
    """
    ident = self.identifier
    if not ident or not isinstance(ident, str):
        return False

    # CIDv0: 'Qm' followed by 44 base58 chars (46 chars in total).
    cidv0_pattern = r"Qm[1-9A-HJ-NP-Za-km-z]{44}"
    # CIDv1: 'bafy' or 'bafk' followed by at least 50 base32 chars.
    cidv1_pattern = r"baf[yk][a-z2-7]{50,}"

    if ident.startswith("ipfs://"):
        candidate = ident[len("ipfs://"):].split("/")[0]
    else:
        try:
            path = urllib.parse.urlparse(ident).path
        except ValueError:
            # urlparse rejects some malformed URLs.
            return False
        if "/ipfs/" in path:
            # The CID is the path segment directly after the first "/ipfs/".
            candidate = path.split("/ipfs/")[1].split("/")[0]
        else:
            # Not a URL form: treat the whole identifier as a raw CID.
            candidate = ident.strip()

    # fullmatch instead of match with "$": "$" also matches before a trailing
    # newline, which would let "ipfs://<cid>\n" through.
    return (
        re.fullmatch(cidv0_pattern, candidate) is not None
        or re.fullmatch(cidv1_pattern, candidate) is not None
    )

def extract_ipfs_cid(self):
    """Extract the IPFS CID from an IPFS URL, scheme, or raw identifier.

    Handles "ipfs://<cid>[/...]", URLs whose path contains "/ipfs/<cid>",
    and raw CIDv0 (Qm...) / CIDv1 (bafy.../bafk...) strings. Query strings
    and fragments are never part of the returned CID.

    Returns:
        The CID (e.g., "bafybeic..." from "https://ipfs.io/ipfs/bafybeic..."),
        or None if no syntactically valid CID is found.
    """
    ident = self.identifier
    if not ident or not isinstance(ident, str):
        return None
    ident = ident.strip()

    # CIDv0: 'Qm' followed by 44 base58 chars (46 chars in total).
    cidv0_pattern = r"Qm[1-9A-HJ-NP-Za-km-z]{44}"
    # CIDv1: 'bafy' or 'bafk' followed by at least 50 base32 chars.
    cidv1_pattern = r"baf[yk][a-z2-7]{50,}"

    if ident.startswith("ipfs://"):
        # Cut at the first path, query or fragment delimiter.
        candidate = re.split(r"[/?#]", ident[len("ipfs://"):], maxsplit=1)[0]
    else:
        try:
            # Work on the parsed path so that "?query" and "#fragment" parts of
            # a gateway URL (e.g. "?filename=paper.pdf") do not end up in the CID.
            path = urllib.parse.urlparse(ident).path
        except ValueError:
            # urlparse rejects some malformed URLs.
            return None
        if "/ipfs/" in path:
            # The CID is the path segment directly after the first "/ipfs/".
            candidate = path.split("/ipfs/")[1].split("/")[0]
        else:
            # Not a URL form: treat the whole identifier as a raw CID.
            candidate = ident

    # Return only a complete, well-formed CID; never a prefix or unvalidated text.
    if re.fullmatch(cidv0_pattern, candidate) or re.fullmatch(cidv1_pattern, candidate):
        return candidate
    return None

def verify_handle(self, val, includeparams=True):
# additional checks for handles since the syntax is very generic
try:
Expand Down
Loading