diff --git a/src/rard/research/migrations/0056_faceted_search.py b/src/rard/research/migrations/0056_faceted_search.py index 21e3b19e..e8ef4b7c 100644 --- a/src/rard/research/migrations/0056_faceted_search.py +++ b/src/rard/research/migrations/0056_faceted_search.py @@ -5,6 +5,10 @@ from rard.utils.text_processors import make_plain_text +def noop(_apps, _schema_editor): + return + + def save_objects_with_plain_text_fields(apps, schema_editor): db_alias = schema_editor.connection.alias Antiquarian = apps.get_model("research", "Antiquarian") @@ -105,5 +109,8 @@ class Migration(migrations.Migration): name='plain_introduction', field=models.TextField(default=''), ), - migrations.RunPython(save_objects_with_plain_text_fields), + migrations.RunPython( + code=save_objects_with_plain_text_fields, + reverse_code=noop, + ), ] diff --git a/src/rard/research/migrations/0076_add_folded_text.py b/src/rard/research/migrations/0076_add_folded_text.py new file mode 100644 index 00000000..e4e2be7f --- /dev/null +++ b/src/rard/research/migrations/0076_add_folded_text.py @@ -0,0 +1,47 @@ +# Generated by Django 3.2 on 2026-05-08 12:07 + +from django.db import migrations, models +from rard.utils.text_processors import fold_latin_and_remove_punctuation + +def noop(_apps, _schema_editor): + return + + +def add_folded_text_fields(apps, schema_editor): + db_alias = schema_editor.connection.alias + OriginalText = apps.get_model("research", "OriginalText") + for object in OriginalText.objects.using(db_alias).all(): + if object.plain_content: + object.folded_content = fold_latin_and_remove_punctuation(object.plain_content) + object.save() + HistoricalOriginalText = apps.get_model( + 'research', 'HistoricalOriginalText' + ) + for object in HistoricalOriginalText.objects.using(db_alias).all(): + if object.plain_content: + object.folded_content = fold_latin_and_remove_punctuation(object.plain_content) + object.save() + + +class Migration(migrations.Migration): + + dependencies = [ + ('research', '0075_add_testimonium_tags'), + ] + + operations = [ + migrations.AddField( + model_name='historicaloriginaltext', + name='folded_content', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='originaltext', + name='folded_content', + field=models.TextField(default=''), + ), + migrations.RunPython( + code=add_folded_text_fields, + reverse_code=noop, + ), + ] diff --git a/src/rard/research/models/original_text.py b/src/rard/research/models/original_text.py index 3072c786..7699c706 100644 --- a/src/rard/research/models/original_text.py +++ b/src/rard/research/models/original_text.py @@ -7,7 +7,10 @@ from rard.research.models.mixins import HistoryModelMixin from rard.research.models.reference import Reference from rard.utils.basemodel import BaseModel, DynamicTextField -from rard.utils.text_processors import make_plain_text +from rard.utils.text_processors import ( + fold_latin_and_remove_punctuation, + make_plain_text, +) class OriginalText(HistoryModelMixin, BaseModel): @@ -53,6 +56,9 @@ def reference_list(self): # Also store copy without html or punctuation for search purposes plain_content = models.TextField(blank=False, default="") + # Also store a copy with all folds applied + folded_content = models.TextField(blank=False, default="") + # to be nuked eventually. not required now but hidden from view # to preserve previous values in case our data migration is insufficient apparatus_criticus = DynamicTextField(default="", blank=True) @@ -71,6 +77,10 @@ def save(self, *args, **kwargs): of list items don't get merged (and other things like that)""" if self.content: self.plain_content = make_plain_text(self.content) + self.folded_content = fold_latin_and_remove_punctuation(self.plain_content) + uf = kwargs.get("update_fields") + if uf is not None and "content" in uf: + kwargs["update_fields"] = {"plain_content", "folded_content"}.union(uf) super(OriginalText, self).save(*args, **kwargs) def apparatus_criticus_lines(self): diff --git a/src/rard/research/tests/views/test_search.py b/src/rard/research/tests/views/test_search.py index e227f97a..e6464ebd 100644 --- a/src/rard/research/tests/views/test_search.py +++ b/src/rard/research/tests/views/test_search.py @@ -316,11 +316,6 @@ def do_search(search_function, keywords): self.assertEqual(do_search(view.fragment_search, "notme"), [f2]) self.assertEqual(do_search(view.fragment_search, "No!TMe"), [f2]) self.assertEqual(do_search(view.fragment_search, "*Me*"), [f1, f2]) - self.assertEqual(do_search(view.fragment_search, "may"), [f1, f2]) - self.assertEqual( - do_search(view.fragment_search, "m!£$%^&()_+-=|\\{[}];@'#<,>./ay"), - [f1, f2], - ) self.assertEqual(do_search(view.fragment_search, "mav"), []) self.assertEqual(do_search(view.fragment_search, 'alcott "louisa may"'), [f1]) self.assertEqual(do_search(view.fragment_search, 'may "louisa alcott"'), []) diff --git a/src/rard/research/views/search.py b/src/rard/research/views/search.py index da9d8892..31a0a899 100644 --- a/src/rard/research/views/search.py +++ b/src/rard/research/views/search.py @@ -7,16 +7,7 @@ from django.conf import settings from django.contrib.auth.mixins import LoginRequiredMixin -from django.db.models import ( - Expression, - ExpressionWrapper, - Func, - Q, - QuerySet, - TextField, - Value, -) -from django.db.models.functions import Lower +from django.db.models import Expression, Func, Q, QuerySet, TextField, Value from django.shortcuts import redirect from django.utils.decorators import method_decorator from django.views.decorators.http import require_GET @@ -34,46 +25,7 @@ Topic, Work, ) - -# Fold [X,Y] transforms all instances of Y into X before matching -# Folds are applied in the specified order, so we don't need -# 'uul' <- 'vul' if we already have 'u' <- 'v' -rard_folds: list[tuple[str, str]] = [ - ("ast", "a est"), - ("ost", "o est"), - ("umst", "um est"), - ("am", "an"), - ("ausa", "aussa"), - ("nn", "bn"), - ("tt", "bt"), - ("pp", "bp"), - ("rr", "br"), - ("ch", "cch"), - ("clu", "culu"), - ("claud", "clod"), - ("has", "hasce"), - ("his", "hisce"), - ("hos", "hosce"), - ("i", "ii"), - ("i", "j"), - ("um", "im"), - ("lagr", "lagl"), - ("mb", "nb"), - ("ll", "nl"), - ("mm", "nm"), - ("mp", "np"), - ("mp", "ndup"), - ("rr", "nr"), - ("um", "om"), - ("u", "v"), - ("u", "y"), - ("uu", "w"), - ("ulc", "ulch"), - ("uul", "uol"), - ("ui", "uui"), - ("uum", "uom"), - ("x", "xs"), -] +from rard.utils.text_processors import fold_latin WILDCARD_SINGLE_CHAR = settings.WILDCARD_SINGLE_CHAR WILDCARD_MANY_CHAR = settings.WILDCARD_MANY_CHAR @@ -106,57 +58,38 @@ class Term: functions relevant to them. """ - def __init__(self, keywords: str) -> None: + def __init__(self, keywords: str): """ Initialize ``Term`` with the keywords. :param keywords: The user's query; a string of keywords. """ - self.cleaned_number = 1 - self.folded_number = 1 + # Using regex for everything doesn't seem to have a big impact + # But replace this line with the alternative code if you want to + # only use regex for search terms containing wildcards + self.lookup = "iregex" + # # If wildcard characters appear in keywords, use regex lookup + # if any([char in keywords for char in CTRL_CHARS]): + # self.lookup = "iregex" + # else: + # self.lookup = "icontains" + # Remove all punctuation except wildcard characers - self.keywords = PUNCTUATION_RE.sub("", keywords).lower() + keyword_string = PUNCTUATION_RE.sub("", keywords).lower() + self.keywords = self.get_keywords(keyword_string) - # The basic function query function will first eliminate html less than - # and greater than character codes, then punctuation, - # and lowercase the 'haystack' strings to be searched. - self.basic_query: Callable[[str], Expression] = lambda q: Lower( - Func( - Func( - q, - Value("&[gl]t;"), - Value(""), - Value("g"), - function="regexp_replace", - ), - Value(PUNCTUATION), - Value(""), - function="translate", + self.folded_keywords = [fold_latin(keyword) for keyword in self.keywords] + + if self.lookup.endswith("regex"): + self.keywords = self.transform_keywords_to_regex(self.keywords) + self.folded_keywords = self.transform_keywords_to_regex( + self.folded_keywords ) - ) - self.query: Callable[[str], Expression] = self.basic_query - # Now we call add_fold repeatedly to add more - # folds to self.query - k = self.keywords - # we will add each relevant fold_to -> fold_from replacement - for fold_to, fold_from in rard_folds: - # if fold_from is in any of the keywords - if fold_from in k: - # then replace it in the keywords - k = k.replace(fold_from, fold_to) - # and get it replaced in all the strings searched. - self.add_fold(fold_from, fold_to) - # otherwise if fold_to is in the keywords - elif fold_to in k: - # get it replaced in the strings searched, - # but we don't need to replace anything in the keywords. - self.add_fold(fold_from, fold_to) - # otherwise this fold is not relevant - self.folded_keywords = k - self.folded_matcher = self.get_matcher(k) + + self.folded_matcher = self.get_matcher(self.folded_keywords) self.nonfolded_matcher = self.get_matcher(self.keywords) - def get_matcher(self, keywords: str) -> Callable[[str], Q]: + def get_matcher(self, keyword_list: Iterable[str]) -> Callable[[str], Q]: """ Get a matcher for the keyword string. @@ -164,7 +97,6 @@ def get_matcher(self, keywords: str) -> Callable[[str], Q]: :return: A function that takes a lookup string and returns an expression that matches these keywords in that lookup string. """ - keyword_list = self.get_keywords(keywords) if len(keyword_list) == 0: # want a query that will always succeed return ~Q(pk__in=[]) @@ -191,19 +123,7 @@ def add_keyword( """ return lambda f: Q(**{f: keyword}) & old(f) - def add_fold(self, fold_from: str, fold_to: str) -> None: - """ - Add another fold to ``self.query``. - - :param fold_from: The string to find and replace. - :param fold_to: The replacement string. - """ - old = self.query - self.query = lambda q: Func( - old(q), Value(fold_from), Value(fold_to), function="replace" - ) - - def get_keywords(self, search_string): + def get_keywords(self, search_string: str) -> list[str]: """ Turns a string into a series of keywords. This is mostly splitting by whitespace, but strings surrounded by double quotes are @@ -216,11 +136,11 @@ def get_keywords(self, search_string): 2. Captures everything inside double quotes 3. Captures individual words """ - # regex 1st alternative matches proximity wil + # regex 1st alternative matches proximity, 2nd quoted phrase, 3rd word keywords = re.findall( - r"(.+\s~\d?:?\d?\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string + r"(.+\s~\d*:?\d*\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string ) - return self.transform_keywords_to_regex(keywords) + return keywords def transform_keywords_to_regex(self, keywords): """Takes a list of keywords which may include wildcard characters @@ -285,11 +205,9 @@ def do_match( self, query_set: QuerySet, query_string: str, - annotation_name: str, - query: Callable[[str], Q], matcher: Callable[[str], Q], - keywords: str, - add_snippet: bool = False, + keyword_list: Iterable[str], + add_snippet=False, ) -> QuerySet: """ Get the queryset for this match portion. @@ -305,18 +223,16 @@ def do_match( :param add_snippet: Should we add a snippet to the resulting queryset? :return: The queryset of results and snippets. """ - expression = ExpressionWrapper( - query(query_string), output_field=TextField() - ) - annotated = query_set.alias(**{annotation_name: expression}) - matches = annotated.filter(matcher(annotation_name + "__regex")) + matches = query_set.filter(matcher(f"{query_string}__{self.lookup}")) snippet = ( - self.snippet_query(keywords, query_string) if add_snippet else Value("") + self.snippet_query(keyword_list, query_string) + if add_snippet + else Value("") ) matches = matches.annotate(snippet=snippet) return matches - def snippet_query(self, keywords: str, query_string: str) -> Expression: + def snippet_query(self, keyword_list: str, query_string: str) -> Expression: """ Get an expression for a getting a snippet. @@ -330,7 +246,7 @@ def snippet_query(self, keywords: str, query_string: str) -> Expression: Func( Func( query_string, - Value(self.get_snippet_regex(keywords)), + Value(self.get_snippet_regex(keyword_list)), Value( r'START_SNIPPET\1' r"\2\3...END_SNIPPET" @@ -358,26 +274,24 @@ def snippet_query(self, keywords: str, query_string: str) -> Expression: output_field=TextField(), ) - def get_snippet_regex( - self, keywords: str, before: int = 5, after: int = 5 - ) -> str: + def get_snippet_regex(self, keywords: Iterable[str], before=5, after=5) -> str: """ Get a regular expression that extracts a snippet from text. For example we can make an HTML snippet with the Postgres SQL ``REGEXP_REPLACE(content, snippet_regex, '\1\2\3')``. - :param keywords: String of keywords (the user's query) + :param keywords: Iterable of keywords (the user's query) -- they + have already been split by unquoted space :param before: The number of words before a keyword we'd like in the snippet. :param after: The number of words after a keyword we'd like in the snippet. :return: A regex that has three capturing groups: 1 is the previous words, 2 is the keyword that was matched, 3 is the subsequent words. """ - keywords = self.get_keywords(keywords) words_before_group = rf"((?:\S+\s){{0,{before}}})" keywords_group = "|".join(keywords) - keywords_group = r"(" + keywords_group + r")" + keywords_group = f"({keywords_group})" words_after_group = rf"(.?\s(?:\S+\s){{0,{after}}})" snippet_regex = words_before_group + keywords_group + words_after_group return snippet_regex @@ -398,13 +312,9 @@ def match( :return: The queryset of results and snippets. The snippet annotation (if present) has the name ``snippet``. """ - annotation_name = "cleaned{0}".format(self.cleaned_number) - self.cleaned_number += 1 return self.do_match( query_set, query_string, - annotation_name, - self.basic_query, self.nonfolded_matcher, self.keywords, add_snippet=add_snippet, @@ -426,16 +336,11 @@ def match_folded( :return: The queryset of results and snippets. The snippet annotation (if present) has the name ``snippet``. """ - annotation_name = "folded{0}".format(self.folded_number) - self.folded_number += 1 - keywords = self.folded_keywords return self.do_match( query_set, query_string, - annotation_name, - self.query, self.folded_matcher, - keywords, + self.folded_keywords, add_snippet=add_snippet, ) @@ -468,7 +373,7 @@ class SearchMethodGroup: search_types: list[SearchField] = [ ("all content", None), - ("original texts", ("original_texts__plain_content", "folded")), + ("original texts", ("original_texts__folded_content", "folded")), ( "translations", ( @@ -675,7 +580,7 @@ def original_text_owner_search( search_fields = [(search_field[0], match_function)] else: search_fields = [ - ("original_texts__plain_content", terms.match_folded), + ("original_texts__folded_content", terms.match_folded), ("original_texts__translation__plain_translated_text", terms.match), ("plain_commentary", terms.match), ("original_texts__translation__translator_name", terms.match), diff --git a/src/rard/utils/text_processors.py b/src/rard/utils/text_processors.py index a447e268..f4228b0b 100644 --- a/src/rard/utils/text_processors.py +++ b/src/rard/utils/text_processors.py @@ -23,3 +23,57 @@ def make_plain_text(content): no_lone_numbers = re.sub(r"\s\d{1,2}\s", " ", no_punctuation) # mentions no_excess_space = re.sub(r" +", " ", no_lone_numbers) return no_excess_space + + +# Fold [X,Y] transforms all instances of Y into X before matching +# Folds are applied in the specified order, so we don't need +# 'uul' <- 'vul' if we already have 'u' <- 'v' +rard_folds: list[tuple[str, str]] = [ + ("ast", "a est"), + ("ost", "o est"), + ("umst", "um est"), + ("am", "an"), + ("ausa", "aussa"), + ("nn", "bn"), + ("tt", "bt"), + ("pp", "bp"), + ("rr", "br"), + ("ch", "cch"), + ("clu", "culu"), + ("claud", "clod"), + ("has", "hasce"), + ("his", "hisce"), + ("hos", "hosce"), + ("i", "ii"), + ("i", "j"), + ("um", "im"), + ("lagr", "lagl"), + ("mb", "nb"), + ("ll", "nl"), + ("mm", "nm"), + ("mp", "np"), + ("mp", "ndup"), + ("rr", "nr"), + ("um", "om"), + ("u", "v"), + ("u", "y"), + ("uu", "w"), + ("ulc", "ulch"), + ("uul", "uol"), + ("ui", "uui"), + ("uum", "uom"), + ("x", "xs"), +] + + +punctuation_re = re.compile(f"(&[lg]t;)|[{re.escape(string.punctuation)}£¬]") + + +def fold_latin(content: str) -> str: + for fold_to, fold_from in rard_folds: + content = content.replace(fold_from, fold_to) + return content + + +def fold_latin_and_remove_punctuation(content: str) -> str: + return fold_latin(punctuation_re.sub("", content))