diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 07e21b5c4..25e8077a9 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -107,8 +107,11 @@ def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup): default='strict', help="How strict to check PAGE multi-level textequiv consistency") @click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly') +@click.option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True) +@click.option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to exclude", default=[], multiple=True) @click.argument('mets_url', default=None, required=False) -def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency): +def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency, + include_fileGrp, exclude_fileGrp): """ Validate a workspace @@ -131,7 +134,9 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency skip=skip, download=download, page_strictness=page_textequiv_consistency, - page_coordinate_consistency=page_coordinate_consistency + page_coordinate_consistency=page_coordinate_consistency, + include_fileGrp=include_fileGrp, + exclude_fileGrp=exclude_fileGrp, ) print(report.to_xml()) if not report.is_valid: diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d8e52dac4..061eace78 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -708,7 +708,7 @@ def process_workspace_handle_tasks(self, tasks: Dict[TFuture, Tuple[str, List[Op nr_errors = dict(nr_errors) nr_all = nr_succeeded + nr_failed if nr_failed > 0: - if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: + if 
config.OCRD_MAX_MISSING_OUTPUTS >= 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})") self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s", @@ -1199,6 +1199,9 @@ def _page_worker_set_ctxt(processor, log_queue): if log_queue: # replace all log handlers with just one queue handler logging.root.handlers = [logging.handlers.QueueHandler(log_queue)] + logging.root.handlers[0].setFormatter( + # insert pageId before actual message + logging.Formatter(fmt='[%(pageId)s] %(message)s')) def _page_worker(*input_files, timeout=0): @@ -1209,6 +1212,11 @@ def _page_worker(*input_files, timeout=0): #_page_worker_processor.process_page_file(*input_files) page_id = next((file.pageId for file in input_files if hasattr(file, 'pageId')), "") + # update log records for QueueHandler formatter + def log_filter(record: logging.LogRecord): + record.pageId = page_id + return record + logging.root.handlers[0].filters = [log_filter] if timeout: if threading.current_thread() is not threading.main_thread(): # does not work outside of main thread diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 76b16460c..f703f7f4c 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -31,6 +31,7 @@ import logging import logging.config from pathlib import Path +from itertools import accumulate import sys from os import chmod @@ -124,7 +125,15 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): for logger_name in logging.root.manager.loggerDict: if not silent: print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr) - logging.getLogger(logger_name).setLevel(lvl) + if (not logger_name or logger_name.startswith('ocrd') or + # skip our default loggers (PIL etc), except for root and ocrd* + not any(prefix 
def sorted_reading_order(ro, regionlist):
    """Return the regions of ``regionlist`` in ReadingOrder traversal order.

    The ReadingOrder tree is reconstructed bottom-up from the elements
    referring to the requested regions (following ``parent_object_``), and
    then walked top-down, depth-first. Siblings that carry an ``index``
    attribute are visited in ascending index order; other siblings keep
    the order in which their regions appear in ``regionlist``.

    Arguments:
        ro (dict): maps region IDs to the ReadingOrder element (region
            reference or group) referring to that region; must contain an
            entry for every region in ``regionlist``
        regionlist (list): region objects (each having an ``id``) to sort

    Returns:
        a new list containing each reachable region of ``regionlist``
        exactly once, in reading order (empty if ``regionlist`` is empty)
    """
    group_types = (OrderedGroupType,
                   OrderedGroupIndexedType,
                   UnorderedGroupType,
                   UnorderedGroupIndexedType)
    regions = {region.id: region for region in regionlist}
    # parent element -> child elements on a path to a requested region;
    # the key None stands for the top-level ReadingOrderType
    parents = {}
    # identities of elements already registered with their parent, so that
    # a group shared by several regions is linked into the tree only once
    # (otherwise its subtree would be emitted once per child)
    registered = set()

    def add_parents(element):
        # go up until the top or an already registered ancestor is reached
        while id(element) not in registered:
            registered.add(id(element))
            parent = element.parent_object_
            if not hasattr(parent, 'id'):
                # reached top ReadingOrderType
                parent = None
            parents.setdefault(parent, []).append(element)
            if not isinstance(parent, group_types):
                break
            element = parent

    for region_id in regions:
        add_parents(ro[region_id])

    result = []

    def add_regionrefs(elements):
        if not elements:
            return
        if hasattr(elements[0], "index"):
            elements = sorted(elements, key=lambda ref: ref.index)
        for element in elements:
            ref = getattr(element, "regionRef", None)
            # an ancestor group may itself refer to a region which is
            # not among the requested ones (e.g. the parent region of
            # the regions being sorted) - skip instead of KeyError
            if ref and ref in regions:
                result.append(regions[ref])
            # go down
            if isinstance(element, group_types):
                # a requested group need not have any requested children
                add_regionrefs(parents.get(element, []))

    add_regionrefs(parents.get(None, []))
    return result
(OrderedGroupType, OrderedGroupIndexedType))): - children = sorted(children, key=lambda child: - readingOrder[child.id].index) + all(child.id in readingOrder for child in children)): + children = sorted_reading_order(readingOrder, children) elif ((getter == 'get_TextLine' and textLineOrder == _ORDER[0][1]) or (getter in ['get_Word', 'get_Glyph'] and readingDirection == _ORDER[0][2])): children = list(reversed(children))