indico · ArmindoFlores · Apr 21, 2026 · Apr 27, 2026
diff --git a/ror/README.md b/ror/README.md
@@ -0,0 +1,2 @@
+# ROR Affiliations Plugin
+TODO
diff --git a/ror/indico_ror/__init__.py b/ror/indico_ror/__init__.py
@@ -0,0 +1,17 @@
+# This file is part of the Indico plugins.
+# Copyright (C) 2002 - 2026 CERN
+#
+# The Indico plugins are free software; you can redistribute
+# them and/or modify them under the terms of the MIT License;
+# see the LICENSE file for more details.
+
+from indico.core import signals
+from indico.util.i18n import make_bound_gettext
+
+
+_ = make_bound_gettext('ror')
+
+
+@signals.core.import_tasks.connect
+def _import_tasks(sender, **kwargs):
+    import indico_ror.task  # noqa: F401
diff --git a/ror/indico_ror/matching.py b/ror/indico_ror/matching.py
@@ -0,0 +1,131 @@
+# This file is part of the Indico plugins.
+# Copyright (C) 2002 - 2026 CERN
+#
+# The Indico plugins are free software; you can redistribute
+# them and/or modify them under the terms of the MIT License;
+# see the LICENSE file for more details.
+
+import ollama
+from langchain_ollama import OllamaEmbeddings
+from sqlalchemy import delete, literal, select, union_all
+
+from indico.core.db import db
+from indico.modules.affiliations.search import AffiliationSearchMatch, AffiliationSearchProvider
+
+from indico_ror.models.affiliation_vs_document import AffiliationVectorStoreDocument
+
+
+def is_model_pulled(model_name: str) -> bool:
+    models = ollama.list()
+    return any(model.model.split(':')[0] == model_name for model in models.models)
+
+
+def ensure_model(model_name: str):
+    """Pull a model if it's not already available."""
+    if not is_model_pulled(model_name):
+        # ollama.pull downloads from some remote archive; we should instead pre-fetch whichever
+        # models we actually need
+        ollama.pull(model_name)
+
+
+class PSQLVectorStoreAffiliationSearchProvider(AffiliationSearchProvider):
+    def __init__(
+         self, model: str = 'jina/jina-embeddings-v2-small-en', batch_size: int = 512, threshold: float = 0.3
+    ) -> None:
+        ensure_model(model)
+        self.model = model
+        self.embeddings = OllamaEmbeddings(
+            model=model,
+        )
+        self.batch_size = batch_size
+        self.threshold = threshold
+
+    @staticmethod
+    def cosine_distance_to_score(distance: float) -> float:
+        return 1 - distance
+
+    def init(self, texts: list[str], affiliation_ids: list[int]) -> list[str]:
+        return self.add(texts, affiliation_ids)
+
+    def add(self, texts: list[str], affiliation_ids: list[int]) -> None:
+        if len(texts) == 0:
+            return
+        for i in range(0, len(texts), self.batch_size):
+            embeddings = self.embeddings.embed_documents(texts[i:i+self.batch_size])
+            for j in range(len(embeddings)):
+                db.session.add(AffiliationVectorStoreDocument(
+                    content=texts[i+j],
+                    embedding=embeddings[j],
+                    affiliation_id=affiliation_ids[i+j],
+                ))
+            db.session.flush()
+        db.session.flush()
+
+    def update(self, texts: list[str], affiliation_ids: list[int], changed_affiliations: list[int]) -> None:
+        if len(texts) == 0:
+            return
+        self.delete(changed_affiliations)
+        return self.add(texts, affiliation_ids)
+
+    def delete(self, affiliation_ids: list[int]) -> None:
+        if len(affiliation_ids) == 0:
+            return
+        db.session.execute(
+            delete(AffiliationVectorStoreDocument)
+            .where(AffiliationVectorStoreDocument.affiliation_id.in_(affiliation_ids))
+        )
+
+    def match_embeddings(
+        self, embeddings: list[list[float]], k: int = 1
+    ) -> list[list[AffiliationSearchMatch]]:
+        subqueries = []
+        for i, embedding in enumerate(embeddings):
+            distance = AffiliationVectorStoreDocument.embedding.cosine_distance(embedding).label('distance')
+            subqueries.append(
+                select(
+                    literal(i).label('embedding_index'),
+                    AffiliationVectorStoreDocument.id.label('id'),
+                    distance
+                ).where(distance < self.threshold)
+                .order_by(distance)
+                .limit(k)
+            )
+        combined = union_all(*subqueries).subquery()
+
+        results = db.session.execute(
+            select(
+                combined.c.embedding_index,
+                AffiliationVectorStoreDocument,
+                combined.c.distance
+            )
+            .join(AffiliationVectorStoreDocument, AffiliationVectorStoreDocument.id == combined.c.id)
+            .order_by(combined.c.embedding_index, combined.c.distance)
+        ).all()
+
+        grouped: dict[int, list[AffiliationSearchMatch]] = {i: [] for i in range(len(embeddings))}
+        for embedding_index, doc, dist in results:
+            grouped[embedding_index].append(AffiliationSearchMatch(
+                score=self.cosine_distance_to_score(dist), text=doc.content, affiliation_id=doc.affiliation_id
+            ))
+
+        return [grouped[i] for i in sorted(grouped.keys())]
+
+    def match_embedding(self, embedding: list[float], k: int = 1) -> list[AffiliationSearchMatch]:
+        distance = AffiliationVectorStoreDocument.embedding.cosine_distance(embedding).label('distance')
+        return [
+            AffiliationSearchMatch(
+                score=self.cosine_distance_to_score(dist), text=doc.content, affiliation_id=doc.affiliation_id
+            )
+            for doc, dist in db.session.execute(
+                select(AffiliationVectorStoreDocument, distance)
+                .where(distance < self.threshold)
+                .order_by(distance)
+                .limit(k)
+            ).all()
+        ]
+
+    def match_many(self, texts: list[str], k: int = 1) -> list[list[AffiliationSearchMatch]]:
+        return self.match_embeddings([self.embeddings.embed_query(text) for text in texts], k)
+
+    def match(self, text: str, k: int = 1) -> list[AffiliationSearchMatch]:
+        return self.match_embedding(self.embeddings.embed_query(text), k)
diff --git a/ror/indico_ror/migrations/.no-header b/ror/indico_ror/migrations/.no-header
diff --git a/ror/indico_ror/migrations/20260313_1243_f95cb312d2bb_add_affiliations_vector_store.py b/ror/indico_ror/migrations/20260313_1243_f95cb312d2bb_add_affiliations_vector_store.py
@@ -0,0 +1,43 @@
+"""Add affiliations vector store.
+
+Revision ID: f95cb312d2bb
+Revises:
+Create Date: 2026-03-13 12:43:32.863241
+"""
+
+import sqlalchemy as sa
+from alembic import context, op
+from pgvector.sqlalchemy import Vector
+from sqlalchemy.sql.ddl import CreateSchema, DropSchema
+
+
+# revision identifiers, used by Alembic.
+revision = 'f95cb312d2bb'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    if not context.is_offline_mode():
+        connection = context.get_bind()
+        # Make sure the vector extension is enabled
+        result = connection.execute("SELECT oid FROM pg_extension where extname = 'vector'")
+        if result.rowcount == 0:
+            raise Exception('The pg_extension "vector" must be enabled to run this update')
+
+    op.execute(CreateSchema('plugin_ror'))
+    op.create_table('affiliation_documents',
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('content', sa.Text(), nullable=False),
+        sa.Column('embedding', Vector(dim=512), nullable=False),
+        sa.Column('affiliation_id', sa.Integer(), nullable=True),
+        sa.ForeignKeyConstraint(['affiliation_id'], ['indico.affiliations.id'], ondelete='cascade'),
+        sa.PrimaryKeyConstraint('id'),
+        schema='plugin_ror'
+    )
+
+
+def downgrade():
+    op.drop_table('affiliation_documents', schema='plugin_ror')
+    op.execute(DropSchema('plugin_ror'))
diff --git a/ror/indico_ror/models/affiliation_vs_document.py b/ror/indico_ror/models/affiliation_vs_document.py
@@ -0,0 +1,27 @@
+# This file is part of the Indico plugins.
+# Copyright (C) 2002 - 2026 CERN
+#
+# The Indico plugins are free software; you can redistribute
+# them and/or modify them under the terms of the MIT License;
+# see the LICENSE file for more details.
+
+from pgvector.sqlalchemy import Vector
+
+from indico.core.db import db
+
+
+class AffiliationVectorStoreDocument(db.Model):
+    __tablename__ = 'affiliation_documents'
+    __table_args__ = {'schema': 'plugin_ror'}
+
+    id = db.Column(db.Integer, primary_key=True)
+    content = db.Column(db.Text, nullable=False)
+    embedding = db.Column(Vector(512), nullable=False)
+    affiliation_id = db.Column(db.Integer, db.ForeignKey('indico.affiliations.id', ondelete='cascade'), nullable=True)
+    affiliation = db.relationship(
+        'Affiliation',
+        backref=db.backref('ror_vector_store_documents', cascade='all, delete-orphan', lazy='dynamic'),
+    )
+
+    def __repr__(self):
+        return f'<VectorStoreDocument(id={self.id}, content={self.content}, affiliation_id={self.affiliation_id})>'