Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Python caches / build artifacts
# NOTE: .dockerignore patterns are anchored at the build-context root, so a
# bare "__pycache__/" only matches ./__pycache__/. The leading "**/" makes the
# pattern match at any depth (e.g. lode/__pycache__/, lode/reader/*.pyc).
**/__pycache__/
**/*.py[cod]
**/*.so
build/
dist/
**/*.egg-info/

# Virtual environments — must NOT clobber the image-built venv
.venv/

# Testing
**/.pytest_cache/
.coverage
htmlcov/

# Runtime spool cache
**/spool/

# VCS / CI / IDE
.git/
.github/
.vscode/
.idea/

# Docs site build artifacts
docs/node_modules/
docs/.astro/
docs/dist/
45 changes: 24 additions & 21 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,41 +1,44 @@
# Base image: Python slim for a lightweight container
FROM python:3.11-slim

# Environment defaults (overridable at runtime)
ENV BASE_URL="lode.opencitations.net"

# Ensure Python output is unbuffered
ENV PYTHONUNBUFFERED=1

# System dependencies + uv (installed to a system path so a non-root user can use it)
# uv is installed exactly once, into /usr/local/bin, which is already on PATH
# for every user; no per-user PATH tweak is needed.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        git \
        python3-dev \
        build-essential \
        curl && \
    curl -LsSf https://astral.sh/uv/install.sh | \
        env UV_INSTALL_DIR=/usr/local/bin INSTALLER_NO_MODIFY_PATH=1 sh && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Non-root user that owns the app and runs the server
RUN useradd --create-home --uid 10001 appuser

# Set the working directory for our application
WORKDIR /website
RUN chown appuser:appuser /website

# Drop privileges before building and running: the venv and the runtime spool
# directory end up owned by appuser, so no root is needed at any point.
USER appuser

# Install dependencies first for better layer caching (frozen = exact lockfile)
COPY --chown=appuser:appuser pyproject.toml uv.lock README.md ./
RUN uv sync --frozen --no-dev --no-install-project

# Application code
COPY --chown=appuser:appuser . .

# At runtime uv must only launch the entrypoint, never re-sync the prebuilt env
ENV UV_NO_SYNC=1

# Service port (>1024, bindable by a non-root user)
EXPOSE 8080

# Start the application with gunicorn via uv
CMD ["uv", "run", "gunicorn", "-c", "gunicorn.conf.py", "lode.api:app"]
68 changes: 55 additions & 13 deletions lode/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,51 @@ class ReadAsFormat(str, Enum):
}

import time
# Canonical (absolute, symlink-resolved) location of the on-disk spool, next to
# this module. Resolving it once here gives later path checks a stable prefix
# to compare against.
SPOOL_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), "spool"))
os.makedirs(SPOOL_DIR, exist_ok=True)
# Both limits are compared against time.time() / st_size, i.e. seconds and bytes.
_SPOOL_TTL = 4 * 60 * 60  # entries are cached for 4 hours
_SPOOL_MAX_BYTES = 1024 ** 3  # 1 GB total budget shared by uploads + URLs

def _spool_path(token: str) -> str:
    """Return the absolute spool file path for ``token``.

    Spool tokens are opaque IDs we mint ourselves (uuid4 hex / "url_"+sha256),
    but a token can also arrive as a caller-supplied ``upload_id``. The path is
    therefore resolved and confirmed to stay inside ``SPOOL_DIR``, so a crafted
    token cannot traverse out of it (path injection).

    Args:
        token: Spool entry identifier, without the ``.rdf`` suffix.

    Returns:
        The resolved path ``<SPOOL_DIR>/<token>.rdf``.

    Raises:
        ArtefactValidationError: If the token resolves outside ``SPOOL_DIR``
            or cannot be turned into a valid path at all.
    """
    try:
        path = os.path.realpath(os.path.join(SPOOL_DIR, f"{token}.rdf"))
        contained = os.path.commonpath((SPOOL_DIR, path)) == SPOOL_DIR
    except ValueError:
        # realpath/commonpath raise ValueError for embedded NUL bytes and for
        # paths that cannot share a prefix (e.g. another drive on Windows).
        # Both mean "not a token we minted": report it as invalid input
        # instead of letting an unhandled ValueError escape.
        contained = False
    if not contained:
        raise ArtefactValidationError("Invalid upload token", context={"token": token})
    return path

def _prune_spool():
    """Evict expired spool entries, then enforce the total-size budget.

    Pass 1 deletes every entry whose mtime is older than ``_SPOOL_TTL``.
    Pass 2 runs only if the survivors still exceed ``_SPOOL_MAX_BYTES``: it
    deletes the oldest entries (by cache-write time) until back under the cap.
    Uploads and URL caches share the same budget.

    Best-effort across workers: every filesystem race is absorbed via
    ``OSError`` handling, so this function never raises because of a file
    that another worker touched concurrently.
    """
    cutoff = time.time() - _SPOOL_TTL
    try:
        names = os.listdir(SPOOL_DIR)
    except OSError:
        # Spool directory missing or unreadable: nothing we can prune.
        return

    survivors = []  # (mtime, size, path) of entries still within the TTL
    for name in names:
        p = os.path.join(SPOOL_DIR, name)
        try:
            st = os.stat(p)
        except OSError:
            continue  # vanished between listdir and stat
        if st.st_mtime < cutoff:
            try:
                os.unlink(p)
            except OSError:
                pass
            continue
        survivors.append((st.st_mtime, st.st_size, p))

    total = sum(size for _, size, _ in survivors)
    if total <= _SPOOL_MAX_BYTES:
        return
    survivors.sort()  # oldest cache-write time first
    for _, size, p in survivors:
        if total <= _SPOOL_MAX_BYTES:
            break
        try:
            os.unlink(p)
        except FileNotFoundError:
            # Another worker already evicted it: the space is free, so it
            # must still be subtracted or we would over-evict newer entries.
            pass
        except OSError:
            continue  # could not delete: the file still occupies its space
        total -= size

Expand Down Expand Up @@ -101,16 +132,25 @@ def _url_token(url, read_as, imported, closure) -> str:
key = f"{url}|{read_as}|{imported}|{closure}".encode()
return "url_" + hashlib.sha256(key).hexdigest()[:32]

def _load_url(url, read_as, imported, closure, warnings):
def _load_url(url, read_as, imported, closure, warnings, use_cache=True):
# Enforce http(s)://host up front: a non-URL value (local path, file://, ...)
# must never reach the loader and be opened as a local file.
security.check_url_safe(url)
_prune_spool()
token = _url_token(url, read_as, imported, closure)
path = _spool_path(token)
if os.path.exists(path):
if use_cache and os.path.exists(path):
# cache hit: rebuild from the saved Turtle
reader = Reader()
reader.load_instances(path, read_as, imported=imported, closure=closure, warnings=warnings)
return reader
# cache miss: download and process from the URL
if not use_cache:
# cache=false: drop the stale copy so the fresh fetch replaces it
try:
os.unlink(path)
except OSError:
pass
# cache miss (or forced refresh): download and process from the URL
reader = Reader()
reader.load_instances(url, read_as, imported=imported, closure=closure, warnings=warnings)
# persist the normalized graph for subsequent hits
Expand All @@ -121,8 +161,9 @@ def _load_url(url, read_as, imported, closure, warnings):
pass
return reader

def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings):
def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings, use_cache=True):
if upload_id:
# Uploads are not re-fetched, so the cache flag does not apply to them.
path = _spool_path(upload_id)
if not os.path.exists(path):
raise ArtefactValidationError("Upload expired, please re-upload",
Expand All @@ -131,7 +172,7 @@ def _resolve_reader(read_as: str, url, upload_id, imported, closure, warnings):
reader.load_instances(path, read_as, imported=imported, closure=closure, warnings=warnings)
return reader
if url:
return _load_url(url, read_as, imported, closure, warnings)
return _load_url(url, read_as, imported, closure, warnings, use_cache=use_cache)
raise ArtefactValidationError("Missing 'url' or 'upload_id'")

# ----------------------------------------------------------
Expand Down Expand Up @@ -197,12 +238,13 @@ async def extract_get(
lang: Optional[str] = None,
imported: Optional[bool] = None,
closure: Optional[bool] = None,
format: Optional[str] = None,
warnings: bool = False
format: Optional[str] = None,
warnings: bool = False,
cache: bool = True
):
_check_format_enabled(read_as)
reader = _resolve_reader(read_as.value, url, upload_id, imported, closure, warnings)

reader = _resolve_reader(read_as.value, url, upload_id, imported, closure, warnings, use_cache=cache)

# Content negotiation
accept = request.headers.get("accept", "text/html")
Expand Down
36 changes: 35 additions & 1 deletion lode/reader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import lode.reader.modules as modules
from lode.reader import security
from lode.exceptions import ArtefactLoadError, ArtefactNotFoundError
from lode.exceptions import ArtefactLoadError, ArtefactNotFoundError, ArtefactValidationError


class Loader:
Expand All @@ -33,6 +33,15 @@ def load(self, source: str) -> None:
if self._is_url(source):
self._load_from_url_with_content_negotiation(source)
else:
# A value carrying a URL scheme that is not http(s) (file:, ftp:, ...)
# must not be silently treated as a local path. Bare local paths
# (no scheme) are still allowed here for the CLI.
scheme = urlparse(source).scheme
if scheme:
raise ArtefactValidationError(
"URL scheme not allowed; use http(s)://host",
context={"scheme": scheme},
)
self._load_from_local_file(source)

if len(self.graph) == 0:
Expand Down Expand Up @@ -191,6 +200,9 @@ def _fetch_following_redirects(self, url: str, headers: dict, max_redirects: int
security.check_url_safe(current)
response = requests.get(current, headers=headers, timeout=10,
stream=True, allow_redirects=False)
# Validate the IP we ACTUALLY connected to, before reading anything:
# defeats DNS rebinding between check_url_safe above and this connect.
self._verify_peer_ip(response)
if response.status_code in (301, 302, 303, 307, 308):
location = response.headers.get("Location")
response.close()
Expand All @@ -201,4 +213,26 @@ def _fetch_following_redirects(self, url: str, headers: dict, max_redirects: int
return response
raise ArtefactLoadError("Too many redirects", context={"url": url})

def _verify_peer_ip(self, response) -> None:
    """Re-check the SSRF policy against the socket's real peer IP.

    If the peer IP cannot be determined (e.g. the response is mocked in
    tests) this is a no-op and the per-hop ``check_url_safe`` done before
    connecting remains the only guard.

    Args:
        response: The streamed ``requests`` response whose connection is
            still open.

    Raises:
        ArtefactValidationError: If the connected peer is a blocked address.
            The response is closed before the error propagates.
    """
    ip = self._peer_ip(response)
    if ip is None:
        return
    try:
        security.check_ip_safe(ip)
    except Exception:
        # Close on *any* failure, not only a policy rejection: an address
        # the checker cannot parse must not leak the streamed connection
        # either. The original exception propagates unchanged.
        response.close()
        raise

@staticmethod
def _peer_ip(response):
"""Best-effort extraction of the connected peer IP from a streamed
requests response (reaches into urllib3 internals, hence defensive)."""
try:
return response.raw._connection.sock.getpeername()[0]
except Exception:
return None


24 changes: 17 additions & 7 deletions lode/reader/security.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,22 @@ def check_extension(name: str) -> None:
if ext not in ALLOWED_EXTENSIONS:
raise ArtefactValidationError("Extension not allowed", context={"ext": ext})

def check_ip_safe(ip_str: str) -> None:
    """Reject an IP that points at an internal/non-routable range (SSRF).

    Intended to be applied both to each resolved IP before connecting and to
    the real peer IP after connecting, so a DNS rebinding between the two
    cannot reach an internal host.

    Args:
        ip_str: Textual IPv4 or IPv6 address.

    Raises:
        ArtefactValidationError: If the address is malformed or is not a
            publicly routable unicast address.
    """
    try:
        ip = ipaddress.ip_address(ip_str)
    except ValueError:
        # Fail closed: an address we cannot parse cannot be proven safe, and
        # callers only handle ArtefactValidationError from this check.
        raise ArtefactValidationError("Blocked address", context={"ip": str(ip_str)}) from None
    # IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) would bypass the v4 checks below.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped
    # `not is_global` additionally covers the shared address space
    # 100.64.0.0/10 (carrier-grade NAT), which is neither private nor global.
    if (ip.is_private or ip.is_loopback or ip.is_link_local
            or ip.is_reserved or ip.is_multicast or ip.is_unspecified
            or not ip.is_global):
        raise ArtefactValidationError("Blocked address", context={"ip": str(ip)})


def check_url_safe(url: str) -> None:
"""Block non-http schemes and SSRF toward private/internal hosts."""
parsed = urlparse(url)
Expand All @@ -91,13 +107,7 @@ def check_url_safe(url: str) -> None:
except socket.gaierror:
raise ArtefactValidationError("Cannot resolve host", context={"host": host})
for info in infos:
ip = ipaddress.ip_address(info[4][0])
# IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1) would bypass the v4 checks below.
if getattr(ip, "ipv4_mapped", None):
ip = ip.ipv4_mapped
if (ip.is_private or ip.is_loopback or ip.is_link_local
or ip.is_reserved or ip.is_multicast or ip.is_unspecified):
raise ArtefactValidationError("Blocked address", context={"host": host, "ip": str(ip)})
check_ip_safe(info[4][0])

def check_is_text(data: bytes) -> None:
"""RDF serializations are text. Reject binary blobs."""
Expand Down
Loading
Loading