opencitations · rempairamore · Jun 24, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,29 @@
+# Python caches / build artifacts
+__pycache__/
+*.py[cod]
+*.so
+build/
+dist/
+*.egg-info/
+
+# Virtual environments — must NOT clobber the image-built venv
+.venv/
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+
+# Runtime spool cache
+**/spool/
+
+# VCS / CI / IDE
+.git/
+.github/
+.vscode/
+.idea/
+
+# Docs site build artifacts
+docs/node_modules/
+docs/.astro/
+docs/dist/
diff --git a/Dockerfile b/Dockerfile
@@ -1,41 +1,44 @@
 # Base image: Python slim for a lightweight container
 FROM python:3.11-slim
 
-# Define environment variables with default values
-# These can be overridden during container runtime
+# Environment defaults (overridable at runtime)
 ENV BASE_URL="lode.opencitations.net"
-
-# Ensure Python output is unbuffered
 ENV PYTHONUNBUFFERED=1
 
-# Install system dependencies + uv
+# System dependencies + uv (installed to a system path so a non-root user can use it)
 RUN apt-get update && \
-    apt-get install -y \
-    git \
-    python3-dev \
-    build-essential \
-    curl && \
-    curl -LsSf https://astral.sh/uv/install.sh | sh && \
+    apt-get install -y --no-install-recommends \
+        git \
+        python3-dev \
+        build-essential \
+        curl && \
+    curl -LsSf https://astral.sh/uv/install.sh | \
+        env UV_INSTALL_DIR=/usr/local/bin INSTALLER_NO_MODIFY_PATH=1 sh && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# Make uv available in PATH
-ENV PATH="/root/.local/bin:$PATH"
+# Non-root user that owns the app and runs the server
+RUN useradd --create-home --uid 10001 appuser
 
-# Set the working directory for our application
 WORKDIR /website
+RUN chown appuser:appuser /website
 
-# Copy dependency files first for better Docker layer caching
-COPY pyproject.toml uv.lock README.md ./
+# Drop privileges before building and running: the venv and the runtime spool
+# directory end up owned by appuser, so root is not needed from here on.
+USER appuser
 
-# Install dependencies (frozen = use exact lockfile versions)
+# Install dependencies first for better layer caching (frozen = exact lockfile)
+COPY --chown=appuser:appuser pyproject.toml uv.lock README.md ./
 RUN uv sync --frozen --no-dev --no-install-project
 
-# Copy application code
-COPY . .
+# Application code
+COPY --chown=appuser:appuser . .
+
+# At runtime uv must only launch the entrypoint, never re-sync the prebuilt env
+ENV UV_NO_SYNC=1
 
-# Expose the port that our service will listen on
+# Service port (>1024, bindable by a non-root user)
 EXPOSE 8080
 
 # Start the application with gunicorn via uv
-CMD ["uv", "run", "gunicorn", "-c", "gunicorn.conf.py", "lode.api:app"]
+CMD ["uv", "run", "gunicorn", "-c", "gunicorn.conf.py", "lode.api:app"]
diff --git a/lode/api.py b/lode/api.py
@@ -59,20 +59,51 @@ class ReadAsFormat(str, Enum):
 }
 
 import time
-SPOOL_DIR = os.path.join(os.path.dirname(__file__), "spool")
+SPOOL_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), "spool"))
 os.makedirs(SPOOL_DIR, exist_ok=True)
-_SPOOL_TTL = 60 * 60
+_SPOOL_TTL = 4 * 60 * 60           # entries are cached for 4 hours
+_SPOOL_MAX_BYTES = 1024 ** 3       # 1 GB total budget shared by uploads + URLs
 
 def _spool_path(token: str) -> str:
-    return os.path.join(SPOOL_DIR, f"{token}.rdf")
+    # Spool tokens are opaque IDs we mint ourselves (uuid4 hex / "url_"+sha256).
+    # Resolve and confirm the path stays inside SPOOL_DIR, so a crafted upload_id
+    # cannot traverse out of it. NUL is refused first: realpath() raises on it.
+    path = None if "\0" in token else os.path.realpath(os.path.join(SPOOL_DIR, f"{token}.rdf"))
+    if path is None or os.path.commonpath((SPOOL_DIR, path)) != SPOOL_DIR:
+        raise ArtefactValidationError("Invalid upload token", context={"token": token})
+    return path
 
 def _prune_spool():
+    """Evict expired entries, then enforce the total-size budget by deleting the
+    oldest (by cache-write time) until back under the cap. Uploads and URL caches
+    share the same budget. Best-effort across workers (races caught via OSError).
+    """
     cutoff = time.time() - _SPOOL_TTL
+    survivors = []  # (mtime, size, path) of entries still within the TTL
     for name in os.listdir(SPOOL_DIR):
         p = os.path.join(SPOOL_DIR, name)
         try:
-            if os.path.getmtime(p) < cutoff:
+            st = os.stat(p)  # one stat call yields both mtime and size
+        except OSError:
+            continue  # entry could not be stat'ed (e.g. already removed); skip it
+        if st.st_mtime < cutoff:
+            try:
                 os.unlink(p)
+            except OSError:
+                pass
+            continue  # expired entries are never counted towards the budget
+        survivors.append((st.st_mtime, st.st_size, p))
+
+    total = sum(size for _, size, _ in survivors)
+    if total <= _SPOOL_MAX_BYTES:
+        return  # already within budget, nothing more to evict
+    survivors.sort()  # oldest cache-write time first
+    for _, size, p in survivors:
+        if total <= _SPOOL_MAX_BYTES:
+            break
+        try:
+            os.unlink(p)
+            total -= size  # credited only when the unlink actually succeeded
         except OSError:
             pass
 
@@ -101,16 +132,25 @@ def _url_token(url, read_as, imported, closure) -> str:
     key = f"{url}|{read_as}|{imported}|{closure}".encode()
     return "url_" + hashlib.sha256(key).hexdigest()[:32]
 
-def _load_url(url, read_as, imported, closure, warnings):
+def _load_url(url, read_as, imported, closure, warnings, use_cache=True):
+    # Enforce http(s)://host up front: a non-URL value (local path, file://, ...)
+    # must never reach the loader and be opened as a local file.
+    security.check_url_safe(url)
     _prune_spool()
     token = _url_token(url, read_as, imported, closure)
     path = _spool_path(token)
-    if os.path.exists(path):
+    if use_cache and os.path.exists(path):
         # cache hit: ricostruisci dal Turtle salvato
         reader = Reader()
         reader.load_instances(path, read_as, imported=imported, closure=closure, warnings=warnings)
         return reader
-    # cache miss: scarica e processa dalla URL
+    if not use_cache:
+        # cache=false: drop the stale copy so the fresh fetch replaces it
+        try:
+            os.unlink(path)
+        except OSError:
+            pass
+    # cache miss (or forced refresh): download and process from the URL
     reader = Reader()
     reader.load_instances(url, read_as, imported=imported, closure=closure, warnings=warnings)
     # persisti il grafo normalizzato per i prossimi hit
@@ -121,8 +161,9 @@ def _load_url(url, read_as, imported, closure, warnings):
         pass
     return reader
 
-def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings):
+def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings, use_cache=True):
     if upload_id:
+        # Uploads are not re-fetched, so the cache flag does not apply to them.
         path = _spool_path(upload_id)
         if not os.path.exists(path):
             raise ArtefactValidationError("Upload expired, please re-upload",
@@ -131,7 +172,7 @@ def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings):
         reader.load_instances(path, read_as, imported=imported, closure=closure, warnings=warnings)
         return reader
     if url:
-        return _load_url(url, read_as, imported, closure, warnings)
+        return _load_url(url, read_as, imported, closure, warnings, use_cache=use_cache)
     raise ArtefactValidationError("Missing 'url' or 'upload_id'")
 
 # ----------------------------------------------------------
@@ -197,12 +238,13 @@ async def extract_get(
     lang: Optional[str] = None,
     imported: Optional[bool] = None,
     closure: Optional[bool] = None,
-    format: Optional[str] = None, 
-    warnings: bool = False
+    format: Optional[str] = None,
+    warnings: bool = False,
+    cache: bool = True
 ):
         _check_format_enabled(read_as)
-        
-        reader = _resolve_reader(read_as.value, url, upload_id, imported, closure, warnings)
+
+        reader = _resolve_reader(read_as.value, url, upload_id, imported, closure, warnings, use_cache=cache)
 
         # Content negotiation
         accept = request.headers.get("accept", "text/html")

diff --git a/lode/reader/loader.py b/lode/reader/loader.py
@@ -9,7 +9,7 @@
 
 import lode.reader.modules as modules
 from lode.reader import security
-from lode.exceptions import ArtefactLoadError, ArtefactNotFoundError
+from lode.exceptions import ArtefactLoadError, ArtefactNotFoundError, ArtefactValidationError
 
 
 class Loader:
@@ -33,6 +33,15 @@ def load(self, source: str) -> None:
         if self._is_url(source):
             self._load_from_url_with_content_negotiation(source)
         else:
+            # A value carrying a URL scheme that is not http(s) (file:, ftp:, ...)
+            # must not be silently treated as a local path. Bare local paths
+            # (no scheme) are still allowed here for the CLI.
+            scheme = urlparse(source).scheme
+            if scheme:
+                raise ArtefactValidationError(
+                    "URL scheme not allowed; use http(s)://host",
+                    context={"scheme": scheme},
+                )
             self._load_from_local_file(source)
 
         if len(self.graph) == 0:
@@ -191,6 +200,9 @@ def _fetch_following_redirects(self, url: str, headers: dict, max_redirects: int
             security.check_url_safe(current)
             response = requests.get(current, headers=headers, timeout=10,
                                     stream=True, allow_redirects=False)
+            # Validate the IP we ACTUALLY connected to, before reading anything:
+            # defeats DNS rebinding between check_url_safe above and this connect.
+            self._verify_peer_ip(response)
             if response.status_code in (301, 302, 303, 307, 308):
                 location = response.headers.get("Location")
                 response.close()
@@ -201,4 +213,26 @@ def _fetch_following_redirects(self, url: str, headers: dict, max_redirects: int
             return response
         raise ArtefactLoadError("Too many redirects", context={"url": url})
 
+    def _verify_peer_ip(self, response) -> None:
+        """Re-check the SSRF policy against the socket's real peer IP; a blocked
+        peer closes the response and re-raises. If the IP cannot be determined
+        (e.g. mocked in tests) only the earlier per-hop check_url_safe applies."""
+        ip = self._peer_ip(response)
+        if ip is None:
+            return
+        try:
+            security.check_ip_safe(ip)
+        except ArtefactValidationError:
+            response.close()
+            raise
+
+    @staticmethod
+    def _peer_ip(response):
+        """Return the connected peer's IP for a streamed requests response, or None
+        if it cannot be read (reaches into urllib3 internals, hence defensive)."""
+        try:
+            return response.raw._connection.sock.getpeername()[0]
+        except Exception:
+            return None
+
 
diff --git a/lode/reader/security.py b/lode/reader/security.py
@@ -78,6 +78,22 @@ def check_extension(name: str) -> None:
     if ext not in ALLOWED_EXTENSIONS:
         raise ArtefactValidationError("Extension not allowed", context={"ext": ext})
 
+def check_ip_safe(ip_str: str) -> None:
+    """Raise ArtefactValidationError for an internal/non-routable IP (SSRF guard).
+
+    Reused before connecting (each resolved IP) and after connecting (the real
+    peer IP), so a DNS rebinding between the two cannot reach an internal host.
+    """
+    ip = ipaddress.ip_address(ip_str)
+    # IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) would bypass the v4 checks below.
+    ip = getattr(ip, "ipv4_mapped", None) or ip
+    # `not is_global` additionally blocks 100.64.0.0/10 (shared/CGNAT space),
+    # which is neither is_private nor is_reserved and would otherwise pass.
+    if (ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved
+            or ip.is_multicast or ip.is_unspecified or not ip.is_global):
+        raise ArtefactValidationError("Blocked address", context={"ip": str(ip)})
+
+
 def check_url_safe(url: str) -> None:
     """Block non-http schemes and SSRF toward private/internal hosts."""
     parsed = urlparse(url)
@@ -91,13 +107,7 @@ def check_url_safe(url: str) -> None:
     except socket.gaierror:
         raise ArtefactValidationError("Cannot resolve host", context={"host": host})
     for info in infos:
-        ip = ipaddress.ip_address(info[4][0])
-        # IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) would bypass the v4 checks below.
-        if getattr(ip, "ipv4_mapped", None):
-            ip = ip.ipv4_mapped
-        if (ip.is_private or ip.is_loopback or ip.is_link_local
-                or ip.is_reserved or ip.is_multicast or ip.is_unspecified):
-            raise ArtefactValidationError("Blocked address", context={"host": host, "ip": str(ip)})
+        check_ip_safe(info[4][0])
 
 def check_is_text(data: bytes) -> None:
     """RDF serializations are text. Reject binary blobs."""