Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ror/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# ROR Affiliations Plugin
TODO
17 changes: 17 additions & 0 deletions ror/indico_ror/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# This file is part of the Indico plugins.
# Copyright (C) 2002 - 2026 CERN
#
# The Indico plugins are free software; you can redistribute
# them and/or modify them under the terms of the MIT License;
# see the LICENSE file for more details.

from indico.core import signals
from indico.util.i18n import make_bound_gettext


_ = make_bound_gettext('ror')


@signals.core.import_tasks.connect
def _import_tasks(sender, **kwargs):
import indico_ror.task # noqa: F401
131 changes: 131 additions & 0 deletions ror/indico_ror/matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# This file is part of the Indico plugins.
# Copyright (C) 2002 - 2026 CERN
#
# The Indico plugins are free software; you can redistribute
# them and/or modify them under the terms of the MIT License;
# see the LICENSE file for more details.

import ollama
from langchain_ollama import OllamaEmbeddings
from sqlalchemy import delete, literal, select, union_all

from indico.core.db import db
from indico.modules.affiliations.search import AffiliationSearchMatch, AffiliationSearchProvider

from indico_ror.models.affiliation_vs_document import AffiliationVectorStoreDocument


def is_model_pulled(model_name: str) -> bool:
models = ollama.list()
return any(model.model.split(':')[0] == model_name for model in models.models)


def ensure_model(model_name: str):
"""Pull a model if it's not already available."""
if not is_model_pulled(model_name):
# ollama.pull downloads from some remote archive; we should instead pre-fetch whichever
# models we actually need
ollama.pull(model_name)


class PSQLVectorStoreAffiliationSearchProvider(AffiliationSearchProvider):
def __init__(
self, model: str = 'jina/jina-embeddings-v2-small-en', batch_size: int = 512, threshold: float = 0.3
) -> None:
ensure_model(model)
self.model = model
self.embeddings = OllamaEmbeddings(
model=model,
)
self.batch_size = batch_size
self.threshold = threshold

@staticmethod
def cosine_distance_to_score(distance: float) -> float:
return 1 - distance

def init(self, texts: list[str], affiliation_ids: list[int]) -> list[str]:
return self.add(texts, affiliation_ids)

def add(self, texts: list[str], affiliation_ids: list[int]) -> None:
if len(texts) == 0:
return
for i in range(0, len(texts), self.batch_size):
embeddings = self.embeddings.embed_documents(texts[i:i+self.batch_size])
for j in range(len(embeddings)):
db.session.add(AffiliationVectorStoreDocument(
content=texts[i+j],
embedding=embeddings[j],
affiliation_id=affiliation_ids[i+j],
))
db.session.flush()
db.session.flush()

def update(self, texts: list[str], affiliation_ids: list[int], changed_affiliations: list[int]) -> None:
if len(texts) == 0:
return
self.delete(changed_affiliations)
return self.add(texts, affiliation_ids)

def delete(self, affiliation_ids: list[int]) -> None:
if len(affiliation_ids) == 0:
return
db.session.execute(
delete(AffiliationVectorStoreDocument)
.where(AffiliationVectorStoreDocument.affiliation_id.in_(affiliation_ids))
)

def match_embeddings(
self, embeddings: list[list[float]], k: int = 1
) -> list[list[AffiliationSearchMatch]]:
subqueries = []
for i, embedding in enumerate(embeddings):
distance = AffiliationVectorStoreDocument.embedding.cosine_distance(embedding).label('distance')
subqueries.append(
select(
literal(i).label('embedding_index'),
AffiliationVectorStoreDocument.id.label('id'),
distance
).where(distance < self.threshold)
.order_by(distance)
.limit(k)
)
combined = union_all(*subqueries).subquery()

results = db.session.execute(
select(
combined.c.embedding_index,
AffiliationVectorStoreDocument,
combined.c.distance
)
.join(AffiliationVectorStoreDocument, AffiliationVectorStoreDocument.id == combined.c.id)
.order_by(combined.c.embedding_index, combined.c.distance)
).all()

grouped: dict[int, list[AffiliationSearchMatch]] = {i: [] for i in range(len(embeddings))}
for embedding_index, doc, dist in results:
grouped[embedding_index].append(AffiliationSearchMatch(
score=self.cosine_distance_to_score(dist), text=doc.content, affiliation_id=doc.affiliation_id
))

return [grouped[i] for i in sorted(grouped.keys())]

def match_embedding(self, embedding: list[float], k: int = 1) -> list[AffiliationSearchMatch]:
distance = AffiliationVectorStoreDocument.embedding.cosine_distance(embedding).label('distance')
return [
AffiliationSearchMatch(
score=self.cosine_distance_to_score(dist), text=doc.content, affiliation_id=doc.affiliation_id
)
for doc, dist in db.session.execute(
select(AffiliationVectorStoreDocument, distance)
.where(distance < self.threshold)
.order_by(distance)
.limit(k)
).all()
]

def match_many(self, texts: list[str], k: int = 1) -> list[list[AffiliationSearchMatch]]:
return self.match_embeddings([self.embeddings.embed_query(text) for text in texts], k)

def match(self, text: str, k: int = 1) -> list[AffiliationSearchMatch]:
return self.match_embedding(self.embeddings.embed_query(text), k)
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Add affiliations vector store.

Revision ID: f95cb312d2bb
Revises:
Create Date: 2026-03-13 12:43:32.863241
"""

import sqlalchemy as sa
from alembic import context, op
from pgvector.sqlalchemy import Vector
from sqlalchemy.sql.ddl import CreateSchema, DropSchema


# revision identifiers, used by Alembic.
revision = 'f95cb312d2bb'
down_revision = None
branch_labels = None
depends_on = None


def upgrade():
if not context.is_offline_mode():
connection = context.get_bind()
# Make sure the vector extension is enabled
result = connection.execute("SELECT oid FROM pg_extension where extname = 'vector'")
if result.rowcount == 0:
raise Exception('The pg_extension "vector" must be enabled to run this update')

op.execute(CreateSchema('plugin_ror'))
op.create_table('affiliation_documents',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('content', sa.Text(), nullable=False),
sa.Column('embedding', Vector(dim=512), nullable=False),
sa.Column('affiliation_id', sa.Integer(), nullable=True),
sa.ForeignKeyConstraint(['affiliation_id'], ['indico.affiliations.id'], ondelete='cascade'),
sa.PrimaryKeyConstraint('id'),
schema='plugin_ror'
)


def downgrade():
op.drop_table('affiliation_documents', schema='plugin_ror')
op.execute(DropSchema('plugin_ror'))
27 changes: 27 additions & 0 deletions ror/indico_ror/models/affiliation_vs_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# This file is part of the Indico plugins.
# Copyright (C) 2002 - 2026 CERN
#
# The Indico plugins are free software; you can redistribute
# them and/or modify them under the terms of the MIT License;
# see the LICENSE file for more details.

from pgvector.sqlalchemy import Vector

from indico.core.db import db


class AffiliationVectorStoreDocument(db.Model):
__tablename__ = 'affiliation_documents'
__table_args__ = {'schema': 'plugin_ror'}

id = db.Column(db.Integer, primary_key=True)
content = db.Column(db.Text, nullable=False)
embedding = db.Column(Vector(512), nullable=False)
affiliation_id = db.Column(db.Integer, db.ForeignKey('indico.affiliations.id', ondelete='cascade'), nullable=True)
affiliation = db.relationship(
'Affiliation',
backref=db.backref('ror_vector_store_documents', cascade='all, delete-orphan', lazy='dynamic'),
)

def __repr__(self):
return f'<VectorStoreDocument(id={self.id}, content={self.content}, affiliation_id={self.affiliation_id})>'
Loading