diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..11ebf31 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,29 @@ +# Python caches / build artifacts +__pycache__/ +*.py[cod] +*.so +build/ +dist/ +*.egg-info/ + +# Virtual environments — must NOT clobber the image-built venv +.venv/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# Runtime spool cache +**/spool/ + +# VCS / CI / IDE +.git/ +.github/ +.vscode/ +.idea/ + +# Docs site build artifacts +docs/node_modules/ +docs/.astro/ +docs/dist/ diff --git a/Dockerfile b/Dockerfile index 84d18b4..af5ab1d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,41 +1,44 @@ # Base image: Python slim for a lightweight container FROM python:3.11-slim -# Define environment variables with default values -# These can be overridden during container runtime +# Environment defaults (overridable at runtime) ENV BASE_URL="lode.opencitations.net" - -# Ensure Python output is unbuffered ENV PYTHONUNBUFFERED=1 -# Install system dependencies + uv +# System dependencies + uv (installed to a system path so a non-root user can use it) RUN apt-get update && \ - apt-get install -y \ - git \ - python3-dev \ - build-essential \ - curl && \ - curl -LsSf https://astral.sh/uv/install.sh | sh && \ + apt-get install -y --no-install-recommends \ + git \ + python3-dev \ + build-essential \ + curl && \ + curl -LsSf https://astral.sh/uv/install.sh | \ + env UV_INSTALL_DIR=/usr/local/bin INSTALLER_NO_MODIFY_PATH=1 sh && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Make uv available in PATH -ENV PATH="/root/.local/bin:$PATH" +# Non-root user that owns the app and runs the server +RUN useradd --create-home --uid 10001 appuser -# Set the working directory for our application WORKDIR /website +RUN chown appuser:appuser /website -# Copy dependency files first for better Docker layer caching -COPY pyproject.toml uv.lock README.md ./ +# Drop privileges before building and running: the venv and the runtime spool +# directory end up owned by appuser, so no root is needed at any point. +USER appuser -# Install dependencies (frozen = use exact lockfile versions) +# Install dependencies first for better layer caching (frozen = exact lockfile) +COPY --chown=appuser:appuser pyproject.toml uv.lock README.md ./ RUN uv sync --frozen --no-dev --no-install-project -# Copy application code -COPY . . +# Application code +COPY --chown=appuser:appuser . . + +# At runtime uv must only launch the entrypoint, never re-sync the prebuilt env +ENV UV_NO_SYNC=1 -# Expose the port that our service will listen on +# Service port (>1024, bindable by a non-root user) EXPOSE 8080 # Start the application with gunicorn via uv -CMD ["uv", "run", "gunicorn", "-c", "gunicorn.conf.py", "lode.api:app"] \ No newline at end of file +CMD ["uv", "run", "gunicorn", "-c", "gunicorn.conf.py", "lode.api:app"] diff --git a/lode/api.py b/lode/api.py index fb9768e..431b7ee 100644 --- a/lode/api.py +++ b/lode/api.py @@ -59,20 +59,51 @@ class ReadAsFormat(str, Enum): } import time -SPOOL_DIR = os.path.join(os.path.dirname(__file__), "spool") +SPOOL_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), "spool")) os.makedirs(SPOOL_DIR, exist_ok=True) -_SPOOL_TTL = 60 * 60 +_SPOOL_TTL = 4 * 60 * 60 # entries are cached for 4 hours +_SPOOL_MAX_BYTES = 1024 ** 3 # 1 GB total budget shared by uploads + URLs def _spool_path(token: str) -> str: - return os.path.join(SPOOL_DIR, f"{token}.rdf") + # Spool tokens are opaque IDs we mint ourselves (uuid4 hex / "url_"+sha256). + # Resolve and confirm the path stays inside SPOOL_DIR, so a crafted upload_id + # cannot traverse out of it (path injection). + path = os.path.realpath(os.path.join(SPOOL_DIR, f"{token}.rdf")) + if os.path.commonpath((SPOOL_DIR, path)) != SPOOL_DIR: + raise ArtefactValidationError("Invalid upload token", context={"token": token}) + return path def _prune_spool(): + """Evict expired entries, then enforce the total-size budget by deleting the + oldest (by cache-write time) until back under the cap. Uploads and URL caches + share the same budget. Best-effort across workers (races caught via OSError). + """ cutoff = time.time() - _SPOOL_TTL + survivors = [] # (mtime, size, path) of entries still within the TTL for name in os.listdir(SPOOL_DIR): p = os.path.join(SPOOL_DIR, name) try: - if os.path.getmtime(p) < cutoff: + st = os.stat(p) + except OSError: + continue + if st.st_mtime < cutoff: + try: os.unlink(p) + except OSError: + pass + continue + survivors.append((st.st_mtime, st.st_size, p)) + + total = sum(size for _, size, _ in survivors) + if total <= _SPOOL_MAX_BYTES: + return + survivors.sort() # oldest cache-write time first + for _, size, p in survivors: + if total <= _SPOOL_MAX_BYTES: + break + try: + os.unlink(p) + total -= size except OSError: pass @@ -101,16 +132,25 @@ def _url_token(url, read_as, imported, closure) -> str: key = f"{url}|{read_as}|{imported}|{closure}".encode() return "url_" + hashlib.sha256(key).hexdigest()[:32] -def _load_url(url, read_as, imported, closure, warnings): +def _load_url(url, read_as, imported, closure, warnings, use_cache=True): + # Enforce http(s)://host up front: a non-URL value (local path, file://, ...) + # must never reach the loader and be opened as a local file. + security.check_url_safe(url) _prune_spool() token = _url_token(url, read_as, imported, closure) path = _spool_path(token) - if os.path.exists(path): + if use_cache and os.path.exists(path): # cache hit: ricostruisci dal Turtle salvato reader = Reader() reader.load_instances(path, read_as, imported=imported, closure=closure, warnings=warnings) return reader - # cache miss: scarica e processa dalla URL + if not use_cache: + # cache=false: drop the stale copy so the fresh fetch replaces it + try: + os.unlink(path) + except OSError: + pass + # cache miss (or forced refresh): scarica e processa dalla URL reader = Reader() reader.load_instances(url, read_as, imported=imported, closure=closure, warnings=warnings) # persisti il grafo normalizzato per i prossimi hit @@ -121,8 +161,9 @@ def _load_url(url, read_as, imported, closure, warnings): pass return reader -def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings): +def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings, use_cache=True): if upload_id: + # Uploads are not re-fetched, so the cache flag does not apply to them. path = _spool_path(upload_id) if not os.path.exists(path): raise ArtefactValidationError("Upload expired, please re-upload", @@ -131,7 +172,7 @@ def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings): reader.load_instances(path, read_as, imported=imported, closure=closure, warnings=warnings) return reader if url: - return _load_url(url, read_as, imported, closure, warnings) + return _load_url(url, read_as, imported, closure, warnings, use_cache=use_cache) raise ArtefactValidationError("Missing 'url' or 'upload_id'") # ---------------------------------------------------------- @@ -197,12 +238,13 @@ async def extract_get( lang: Optional[str] = None, imported: Optional[bool] = None, closure: Optional[bool] = None, - format: Optional[str] = None, - warnings: bool = False + format: Optional[str] = None, + warnings: bool = False, + cache: bool = True ): _check_format_enabled(read_as) - - reader = _resolve_reader(read_as.value, url, upload_id, imported, closure, warnings) + + reader = _resolve_reader(read_as.value, url, upload_id, imported, closure, warnings, use_cache=cache) # Content negotiation accept = request.headers.get("accept", "text/html") diff --git a/lode/reader/loader.py b/lode/reader/loader.py index 230abfc..0d250fa 100644 --- a/lode/reader/loader.py +++ b/lode/reader/loader.py @@ -9,7 +9,7 @@ import lode.reader.modules as modules from lode.reader import security -from lode.exceptions import ArtefactLoadError, ArtefactNotFoundError +from lode.exceptions import ArtefactLoadError, ArtefactNotFoundError, ArtefactValidationError class Loader: @@ -33,6 +33,15 @@ def load(self, source: str) -> None: if self._is_url(source): self._load_from_url_with_content_negotiation(source) else: + # A value carrying a URL scheme that is not http(s) (file:, ftp:, ...) + # must not be silently treated as a local path. Bare local paths + # (no scheme) are still allowed here for the CLI. + scheme = urlparse(source).scheme + if scheme: + raise ArtefactValidationError( + "URL scheme not allowed; use http(s)://host", + context={"scheme": scheme}, + ) self._load_from_local_file(source) if len(self.graph) == 0: @@ -191,6 +200,9 @@ def _fetch_following_redirects(self, url: str, headers: dict, max_redirects: int security.check_url_safe(current) response = requests.get(current, headers=headers, timeout=10, stream=True, allow_redirects=False) + # Validate the IP we ACTUALLY connected to, before reading anything: + # defeats DNS rebinding between check_url_safe above and this connect. + self._verify_peer_ip(response) if response.status_code in (301, 302, 303, 307, 308): location = response.headers.get("Location") response.close() @@ -201,4 +213,26 @@ def _fetch_following_redirects(self, url: str, headers: dict, max_redirects: int return response raise ArtefactLoadError("Too many redirects", context={"url": url}) + def _verify_peer_ip(self, response) -> None: + """Re-check the SSRF policy against the socket's real peer IP. If it + cannot be determined (e.g. mocked in tests) fall back to the per-hop + check_url_safe already done before connecting.""" + ip = self._peer_ip(response) + if ip is None: + return + try: + security.check_ip_safe(ip) + except ArtefactValidationError: + response.close() + raise + + @staticmethod + def _peer_ip(response): + """Best-effort extraction of the connected peer IP from a streamed + requests response (reaches into urllib3 internals, hence defensive).""" + try: + return response.raw._connection.sock.getpeername()[0] + except Exception: + return None + diff --git a/lode/reader/security.py b/lode/reader/security.py index d677737..cf7067e 100644 --- a/lode/reader/security.py +++ b/lode/reader/security.py @@ -78,6 +78,22 @@ def check_extension(name: str) -> None: if ext not in ALLOWED_EXTENSIONS: raise ArtefactValidationError("Extension not allowed", context={"ext": ext}) +def check_ip_safe(ip_str: str) -> None: + """Reject an IP that points at an internal/non-routable range (SSRF). + + Reused both before connecting (each resolved IP) and after connecting (the + real peer IP), so a DNS rebinding between the two cannot reach an internal + host. + """ + ip = ipaddress.ip_address(ip_str) + # IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) would bypass the v4 checks below. + if getattr(ip, "ipv4_mapped", None): + ip = ip.ipv4_mapped + if (ip.is_private or ip.is_loopback or ip.is_link_local + or ip.is_reserved or ip.is_multicast or ip.is_unspecified): + raise ArtefactValidationError("Blocked address", context={"ip": str(ip)}) + + def check_url_safe(url: str) -> None: """Block non-http schemes and SSRF toward private/internal hosts.""" parsed = urlparse(url) @@ -91,13 +107,7 @@ def check_url_safe(url: str) -> None: except socket.gaierror: raise ArtefactValidationError("Cannot resolve host", context={"host": host}) for info in infos: - ip = ipaddress.ip_address(info[4][0]) - # IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) would bypass the v4 checks below. - if getattr(ip, "ipv4_mapped", None): - ip = ip.ipv4_mapped - if (ip.is_private or ip.is_loopback or ip.is_link_local - or ip.is_reserved or ip.is_multicast or ip.is_unspecified): - raise ArtefactValidationError("Blocked address", context={"host": host, "ip": str(ip)}) + check_ip_safe(info[4][0]) def check_is_text(data: bytes) -> None: """RDF serializations are text. Reject binary blobs.""" diff --git a/lode/templates/index.html b/lode/templates/index.html index 0bc0b39..f9ac170 100644 --- a/lode/templates/index.html +++ b/lode/templates/index.html @@ -18,7 +18,8 @@