Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/rard/research/migrations/0056_faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
from rard.utils.text_processors import make_plain_text


def noop(_apps, _schema_editor):
    """Do nothing; passed as ``reverse_code`` so the data step can be unapplied."""
    return None


def save_objects_with_plain_text_fields(apps, schema_editor):
db_alias = schema_editor.connection.alias
Antiquarian = apps.get_model("research", "Antiquarian")
Expand Down Expand Up @@ -105,5 +109,8 @@ class Migration(migrations.Migration):
name='plain_introduction',
field=models.TextField(default=''),
),
migrations.RunPython(save_objects_with_plain_text_fields),
migrations.RunPython(
code=save_objects_with_plain_text_fields,
reverse_code=noop,
),
]
47 changes: 47 additions & 0 deletions src/rard/research/migrations/0076_add_folded_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Generated by Django 3.2 on 2026-05-08 12:07

from django.db import migrations, models
from rard.utils.text_processors import fold_latin_and_remove_punctuation

def noop(_apps, _schema_editor):
    """Do nothing; passed as ``reverse_code`` so the data step can be unapplied."""
    return None


def add_folded_text_fields(apps, schema_editor):
    """Backfill ``folded_content`` on original texts and their history rows.

    For each ``OriginalText`` and ``HistoricalOriginalText`` row with a
    non-empty ``plain_content``, store
    ``fold_latin_and_remove_punctuation(plain_content)`` in
    ``folded_content`` and save the row. Rows whose ``plain_content`` is
    empty are not modified or saved.

    Args:
        apps: App registry handed to the migration; used to look up models.
        schema_editor: Schema editor whose connection alias selects the
            database to read from and write to.
    """
    db_alias = schema_editor.connection.alias
    # Both models receive identical treatment, so iterate over their names
    # instead of duplicating the loop body for each one.
    for model_name in ("OriginalText", "HistoricalOriginalText"):
        model = apps.get_model("research", model_name)
        # Named `record` (not `object`) so the builtin is not shadowed.
        for record in model.objects.using(db_alias).all():
            if record.plain_content:
                record.folded_content = fold_latin_and_remove_punctuation(
                    record.plain_content
                )
                record.save()


class Migration(migrations.Migration):
    """Add the ``folded_content`` column to original texts and backfill it."""

    dependencies = [("research", "0075_add_testimonium_tags")]

    operations = [
        # Schema: add the new text column to the history model and the
        # live model, in that order.
        migrations.AddField(
            model_name="historicaloriginaltext",
            name="folded_content",
            field=models.TextField(default=""),
        ),
        migrations.AddField(
            model_name="originaltext",
            name="folded_content",
            field=models.TextField(default=""),
        ),
        # Data: populate the new column; the reverse step does nothing.
        migrations.RunPython(add_folded_text_fields, noop),
    ]
12 changes: 11 additions & 1 deletion src/rard/research/models/original_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
from rard.research.models.mixins import HistoryModelMixin
from rard.research.models.reference import Reference
from rard.utils.basemodel import BaseModel, DynamicTextField
from rard.utils.text_processors import make_plain_text
from rard.utils.text_processors import (
make_plain_text,
fold_latin_and_remove_punctuation,
)


class OriginalText(HistoryModelMixin, BaseModel):
Expand Down Expand Up @@ -53,6 +56,9 @@ def reference_list(self):
# Also store copy without html or punctuation for search purposes
plain_content = models.TextField(blank=False, default="")

# Also store a copy with all folds applied
folded_content = models.TextField(blank=False, default="")

# to be nuked eventually. not required now but hidden from view
# to preserve previous values in case our data migration is insufficient
apparatus_criticus = DynamicTextField(default="", blank=True)
Expand All @@ -71,6 +77,10 @@ def save(self, *args, **kwargs):
of list items don't get merged (and other things like that)"""
if self.content:
self.plain_content = make_plain_text(self.content)
self.folded_content = fold_latin_and_remove_punctuation(self.plain_content)
uf = kwargs.get("update_fields")
if uf is not None and "content" in uf:
kwargs["update_fields"] = {"plain_content", "folded_content"}.union(uf)
super(OriginalText, self).save(*args, **kwargs)

def apparatus_criticus_lines(self):
Expand Down
5 changes: 0 additions & 5 deletions src/rard/research/tests/views/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,6 @@ def do_search(search_function, keywords):
self.assertEqual(do_search(view.fragment_search, "notme"), [f2])
self.assertEqual(do_search(view.fragment_search, "No!TMe"), [f2])
self.assertEqual(do_search(view.fragment_search, "*Me*"), [f1, f2])
self.assertEqual(do_search(view.fragment_search, "may"), [f1, f2])
self.assertEqual(
do_search(view.fragment_search, "m!£$%^&()_+-=|\\{[}];@'#<,>./ay"),
[f1, f2],
)
self.assertEqual(do_search(view.fragment_search, "mav"), [])
self.assertEqual(do_search(view.fragment_search, 'alcott "louisa may"'), [f1])
self.assertEqual(do_search(view.fragment_search, 'may "louisa alcott"'), [])
Expand Down
151 changes: 34 additions & 117 deletions src/rard/research/views/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import partial
from itertools import chain
from string import punctuation
from collections.abc import Iterable

from django.conf import settings
from django.contrib.auth.mixins import LoginRequiredMixin
Expand All @@ -24,46 +25,8 @@
Topic,
Work,
)
from rard.utils.text_processors import fold_latin

# Fold [X,Y] transforms all instances of Y into X before matching
# Folds are applied in the specified order, so we don't need
# 'uul' <- 'vul' if we already have 'u' <- 'v'
rard_folds = [
["ast", "a est"],
["ost", "o est"],
["umst", "um est"],
["am", "an"],
["ausa", "aussa"],
["nn", "bn"],
["tt", "bt"],
["pp", "bp"],
["rr", "br"],
["ch", "cch"],
["clu", "culu"],
["claud", "clod"],
["has", "hasce"],
["his", "hisce"],
["hos", "hosce"],
["i", "ii"],
["i", "j"],
["um", "im"],
["lagr", "lagl"],
["mb", "nb"],
["ll", "nl"],
["mm", "nm"],
["mp", "np"],
["mp", "ndup"],
["rr", "nr"],
["um", "om"],
["u", "v"],
["u", "y"],
["uu", "w"],
["ulc", "ulch"],
["uul", "uol"],
["ui", "uui"],
["uum", "uom"],
["x", "xs"],
]

WILDCARD_SINGLE_CHAR = settings.WILDCARD_SINGLE_CHAR
WILDCARD_MANY_CHAR = settings.WILDCARD_MANY_CHAR
Expand Down Expand Up @@ -99,54 +62,33 @@ class Term:
"""

def __init__(self, keywords):
self.cleaned_number = 1
self.folded_number = 1
# Remove all punctuation except wildcard characters
self.keywords = PUNCTUATION_RE.sub("", keywords).lower()

# Using regex for everything doesn't seem to have a big impact
# But replace this line with the alternative code if you want to
# only use regex for search terms containing wildcards
self.lookup = "regex"
self.lookup = "iregex"
# # If wildcard characters appear in keywords, use regex lookup
# if any([char in self.keywords for char in CTRL_CHARS]):
# self.lookup = "regex"
# if any([char in keywords for char in CTRL_CHARS]):
# self.lookup = "iregex"
# else:
# self.lookup = "contains"
# self.lookup = "icontains"

# The basic function query function will first eliminate html less than
# and greater than character codes, then punctuation,
# and lowercase the 'haystack' strings to be searched.
self.basic_query = lambda q: Lower(
Func(
Func(
q,
Value("&[gl]t;"),
Value(""),
Value("g"),
function="regexp_replace",
),
Value(PUNCTUATION),
Value(""),
function="translate",
)
)
self.query = self.basic_query
# Now we call add_fold repeatedly to add more
# folds to self.query
k = self.keywords
for fold_to, fold_from in rard_folds:
if fold_from in k:
k = k.replace(fold_from, fold_to)
self.add_fold(fold_from, fold_to)
elif fold_to in k:
self.add_fold(fold_from, fold_to)
self.folded_keywords = k
self.folded_matcher = self.get_matcher(k)
# Remove all punctuation except wildcard characters
keyword_string = PUNCTUATION_RE.sub("", keywords).lower()
self.keywords = self.get_keywords(keyword_string)

self.folded_keywords = [
fold_latin(keyword)
for keyword in self.keywords
]

if self.lookup.endswith("regex"):
self.keywords = self.transform_keywords_to_regex(self.keywords)
self.folded_keywords = self.transform_keywords_to_regex(self.folded_keywords)

self.folded_matcher = self.get_matcher(self.folded_keywords)
self.nonfolded_matcher = self.get_matcher(self.keywords)

def get_matcher(self, keywords):
keyword_list = self.get_keywords(keywords)
def get_matcher(self, keyword_list: Iterable[str]):
if len(keyword_list) == 0:
# want a keyword that will always succeed
first_keyword = ""
Expand All @@ -164,12 +106,6 @@ def matcher(field):
def add_keyword(self, old, keyword):
return lambda f: Q(**{f: keyword}) & old(f)

def add_fold(self, fold_from, fold_to):
old = self.query
self.query = lambda q: Func(
old(q), Value(fold_from), Value(fold_to), function="replace"
)

def get_keywords(self, search_string):
"""
Turns a string into a series of keywords. This is mostly splitting
Expand All @@ -184,12 +120,10 @@ def get_keywords(self, search_string):
2. Captures everything inside double quotes
3. Captures individual words
"""
# regex 1st alternative matches proximity wil
# regex 1st alternative matches proximity, 2nd quoted phrase, 3rd word
keywords = re.findall(
r"(.+\s~\d?:?\d?\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string
r"(.+\s~\d*:?\d*\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string
)
if self.lookup == "regex":
keywords = self.transform_keywords_to_regex(keywords)
return keywords

def transform_keywords_to_regex(self, keywords):
Expand Down Expand Up @@ -254,32 +188,26 @@ def do_match(
self,
query_set,
query_string,
annotation_name,
query,
matcher,
keywords,
keyword_list: Iterable[str],
add_snippet=False,
):
expression = ExpressionWrapper(
query(query_string), output_field=TextField()
)
annotated = query_set.annotate(**{annotation_name: expression})
matches = annotated.filter(matcher(annotation_name + "__" + self.lookup))
matches = query_set.filter(matcher(f"{query_string}__{self.lookup}"))
if add_snippet:
matches = self.annotate_with_snippet(matches, keywords, query_string)
matches = self.annotate_with_snippet(matches, keyword_list, query_string)
else:
matches = matches.annotate(snippet=Value(""))
return matches

def annotate_with_snippet(self, qs, keywords, query_string):
def annotate_with_snippet(self, qs, keyword_list: Iterable[str], query_string):
return qs.annotate(
snippet=Func(
Func(
Func(
Func(
Func(
query_string,
Value(self.get_snippet_regex(keywords)),
Value(self.get_snippet_regex(keyword_list)),
Value(
r'START_SNIPPET\1<span class="search-snippet">'
r"\2</span>\3...END_SNIPPET"
Expand Down Expand Up @@ -308,43 +236,33 @@ def annotate_with_snippet(self, qs, keywords, query_string):
)
)

def get_snippet_regex(self, keywords, before=5, after=5):
def get_snippet_regex(self, keywords: Iterable[str], before=5, after=5):
"""This regex should give us three capturing groups we can use
with postgres REGEXP_REPLACE to insert <span> tags around our keywords;
e.g. REGEXP_REPLACE('content',headline_regex,'\1 <span>\2</span>\3')
"""
keywords = self.get_keywords(keywords)
words_before_group = rf"((?:\S+\s){{0,{before}}})"
keywords_group = "|".join(keywords)
keywords_group = r"(" + keywords_group + r")"
keywords_group = f"({keywords_group})"
words_after_group = rf"(.?\s(?:\S+\s){{0,{after}}})"
snippet_regex = words_before_group + keywords_group + words_after_group
return snippet_regex

def match(self, query_set, query_string, add_snippet=False):
annotation_name = "cleaned{0}".format(self.cleaned_number)
self.cleaned_number += 1
return self.do_match(
query_set,
query_string,
annotation_name,
self.basic_query,
self.nonfolded_matcher,
self.keywords,
add_snippet=add_snippet,
)

def match_folded(self, query_set, query_string, add_snippet=False):
annotation_name = "folded{0}".format(self.folded_number)
self.folded_number += 1
keywords = self.folded_keywords
return self.do_match(
query_set,
query_string,
annotation_name,
self.query,
self.folded_matcher,
keywords,
self.folded_keywords,
add_snippet=add_snippet,
)

Expand All @@ -370,7 +288,7 @@ class SearchMethodGroup:

search_types = [
("all content", None),
("original texts", ["original_texts__plain_content", "folded"]),
("original texts", ["original_texts__folded_content", "folded"]),
(
"translations",
[
Expand Down Expand Up @@ -451,8 +369,7 @@ def SEARCH_METHODS(self):
@classmethod
def generic_content_search(cls, qs, search_fields):
results = []
for field in search_fields:
field_name, match_function = field
for field_name, match_function in search_fields:
matches = match_function(qs, field_name, add_snippet=True)
results.append(matches)
# Remove objects from queryset once matched so they don't get matched twice
Expand Down Expand Up @@ -508,7 +425,7 @@ def original_text_owner_search(cls, terms, qs, search_field=None):
search_fields = [(search_field[0], match_function)]
else:
search_fields = [
("original_texts__plain_content", terms.match_folded),
("original_texts__folded_content", terms.match_folded),
("original_texts__translation__plain_translated_text", terms.match),
("plain_commentary", terms.match),
("original_texts__translation__translator_name", terms.match),
Expand Down
Loading
Loading