Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,11 @@
default='strict', help="How strict to check PAGE multi-level textequiv consistency")
@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency",
type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
@click.option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True)
@click.option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to exclude", default=[], multiple=True)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good for consistency and useful for doing targeted validation of experimental workspaces with lots of groups.

@click.argument('mets_url', default=None, required=False)
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency,
include_fileGrp, exclude_fileGrp):
"""
Validate a workspace

Expand All @@ -131,7 +134,9 @@
skip=skip,
download=download,
page_strictness=page_textequiv_consistency,
page_coordinate_consistency=page_coordinate_consistency
page_coordinate_consistency=page_coordinate_consistency,
include_fileGrp=include_fileGrp,
exclude_fileGrp=exclude_fileGrp,
)
print(report.to_xml())
if not report.is_valid:
Expand Down Expand Up @@ -316,7 +321,7 @@
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run,

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 324 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

C901

'workspace_cli_bulk_add' is too complex (29)
file_glob, src_path_option, ignore, force, skip):
"""
Add files in bulk to an OCR-D workspace.
Expand Down
10 changes: 9 additions & 1 deletion src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@
:py:meth:`Processor.process()` was not overridden.
"""

class DummyFuture:

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E302

expected 2 blank lines, found 1

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E302

expected 2 blank lines, found 1

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E302

expected 2 blank lines, found 1

Check failure on line 124 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E302

expected 2 blank lines, found 1
"""
Mimics some of `concurrent.futures.Future` but runs immediately.
"""
Expand Down Expand Up @@ -708,7 +708,7 @@
nr_errors = dict(nr_errors)
nr_all = nr_succeeded + nr_failed
if nr_failed > 0:
if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
if config.OCRD_MAX_MISSING_OUTPUTS >= 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS:
raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})")
self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors))
self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s",
Expand Down Expand Up @@ -941,7 +941,7 @@
value=self.version),
LabelType(type_='ocrd/core',
value=OCRD_VERSION)])
])

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E124

closing bracket does not match visual indentation

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E124

closing bracket does not match visual indentation

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E124

closing bracket does not match visual indentation

Check failure on line 944 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E124

closing bracket does not match visual indentation
metadata_obj.add_MetadataItem(metadata_item)

def resolve_resource(self, val):
Expand Down Expand Up @@ -1199,6 +1199,9 @@
if log_queue:
# replace all log handlers with just one queue handler
logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
logging.root.handlers[0].setFormatter(
# insert pageId before actual message
logging.Formatter(fmt='[%(pageId)s] %(message)s'))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very useful for debugging. An eventual (yet another) refactoring of the logging to provide such metadata per-message throughout would be tremendous. "Wenn mal Zeit ist..."



def _page_worker(*input_files, timeout=0):
Expand All @@ -1209,6 +1212,11 @@
#_page_worker_processor.process_page_file(*input_files)
page_id = next((file.pageId for file in input_files
if hasattr(file, 'pageId')), "")
# update log records for QueueHandler formatter
def log_filter(record: logging.LogRecord):

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E306

expected 1 blank line before a nested definition, found 0

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E306

expected 1 blank line before a nested definition, found 0

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E306

expected 1 blank line before a nested definition, found 0

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E306

expected 1 blank line before a nested definition, found 0

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E306

expected 1 blank line before a nested definition, found 0

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E306

expected 1 blank line before a nested definition, found 0

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E306

expected 1 blank line before a nested definition, found 0

Check failure on line 1216 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E306

expected 1 blank line before a nested definition, found 0
record.pageId = page_id
return record
logging.root.handlers[0].filters = [log_filter]
if timeout:
if threading.current_thread() is not threading.main_thread():
# does not work outside of main thread
Expand Down
11 changes: 10 additions & 1 deletion src/ocrd_utils/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import logging
import logging.config
from pathlib import Path
from itertools import accumulate
import sys
from os import chmod

Expand Down Expand Up @@ -124,7 +125,15 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG):
for logger_name in logging.root.manager.loggerDict:
if not silent:
print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr)
logging.getLogger(logger_name).setLevel(lvl)
if (not logger_name or logger_name.startswith('ocrd') or
# skip our default loggers (PIL etc), except for root and ocrd*
not any(prefix in LOGGING_DEFAULTS
for prefix in map(
".".join,
accumulate(
map(lambda x: (x,),
logger_name.split('.')))))):
logging.getLogger(logger_name).setLevel(lvl)


def get_logging_config_files():
Expand Down
46 changes: 41 additions & 5 deletions src/ocrd_validators/page_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,45 @@ def page_get_reading_order(ro, rogroup):
page_get_reading_order(ro, elem)


def sorted_reading_order(ro, regionlist):
    """
    Sort regions by their position in the (possibly nested) reading order.

    Starting from the reading-order element of every region, the chain of
    ``parent_object_`` links is followed upwards to build a parent-to-children
    mapping. That tree is then walked top-down, sorting siblings by ``index``
    where they have one, and collecting the referenced regions.

    Arguments:
        ro: mapping from region ID to its reading-order element
            (looked up as ``ro[region.id]`` for every given region)
        regionlist: iterable of regions (objects with an ``id`` attribute)

    Returns:
        list of the regions from ``regionlist`` in traversal order; each
        region appears at most once. Empty input yields an empty list.
    """
    regions = {region.id: region for region in regionlist}
    if not regions:
        # nothing to sort (and no top-level entry to start the descent from)
        return []
    group_types = (OrderedGroupType,
                   OrderedGroupIndexedType,
                   UnorderedGroupType,
                   UnorderedGroupIndexedType)
    # parent element (or None for the top level) -> list of child elements
    parents = {}
    # identities of elements already registered with their parent:
    # a group shared by several regions must be recorded only once,
    # otherwise its whole subtree would be traversed (and emitted) repeatedly
    registered = set()

    def add_parents(element):
        if id(element) in registered:
            # this element and all of its ancestors are already in the tree
            return
        registered.add(id(element))
        parent = element.parent_object_
        if not hasattr(parent, 'id'):
            # reached top ReadingOrderType
            parent = None
        parents.setdefault(parent, []).append(element)
        if isinstance(parent, group_types):
            # go up
            add_parents(parent)

    for region_id in regions:
        add_parents(ro[region_id])
    result = []

    def add_regionrefs(elements):
        if hasattr(elements[0], "index"):
            elements = sorted(elements, key=lambda ref: ref.index)
        for element in elements:
            ref = getattr(element, "regionRef", None)
            # only emit regions we were asked to sort: a group may
            # reference a region outside of regionlist
            if ref and ref in regions:
                result.append(regions[ref])
            # go down
            if isinstance(element, group_types):
                # a group reached via its own regionRef may have
                # no registered children
                children = parents.get(element)
                if children:
                    add_regionrefs(children)

    toplevel = parents.get(None)
    if toplevel:
        add_regionrefs(toplevel)
    return result

def make_poly(polygon_points):
"""Instantiate a Polygon from a list of point pairs, or return an error string"""
if len(polygon_points) < 4:
Expand Down Expand Up @@ -298,11 +337,8 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate
continue
children = getattr(node, getter)()
if (getter == 'get_TextRegion' and children and
all(child.id in readingOrder for child in children) and
isinstance(readingOrder[children[0].id].parent_object_,
(OrderedGroupType, OrderedGroupIndexedType))):
children = sorted(children, key=lambda child:
readingOrder[child.id].index)
all(child.id in readingOrder for child in children)):
children = sorted_reading_order(readingOrder, children)
elif ((getter == 'get_TextLine' and textLineOrder == _ORDER[0][1]) or
(getter in ['get_Word', 'get_Glyph'] and readingDirection == _ORDER[0][2])):
children = list(reversed(children))
Expand Down
Loading