From 2257876273438c4d3463d225f54fa03c79b876bb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 10 Aug 2020 16:50:40 +0200 Subject: [PATCH 01/70] utils: implement resource lookup logic --- ocrd_utils/ocrd_utils/__init__.py | 1 + ocrd_utils/ocrd_utils/constants.py | 8 ++++++++ ocrd_utils/ocrd_utils/os.py | 21 +++++++++++++++++++-- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index 3e882d83db..5b51ac92a4 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -125,6 +125,7 @@ from .os import ( abspath, + list_resource_candidates, pushd_popd, unzip_file_to_dir) diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index b300ddcb5d..a7d38b8f32 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -2,6 +2,8 @@ Constants for ocrd_utils. """ from pkg_resources import get_distribution +import os +from os.path import join, expanduser __all__ = [ 'EXT_TO_MIME', @@ -13,6 +15,8 @@ 'PIL_TO_MIME', 'REGEX_PREFIX', 'VERSION', + 'XDG_CONFIG_HOME', + 'XDG_DATA_HOME', ] VERSION = get_distribution('ocrd_utils').version @@ -84,3 +88,7 @@ # Log level format implementing https://ocr-d.de/en/spec/cli#logging LOG_FORMAT = r'%(asctime)s.%(msecs)03d %(levelname)s %(name)s - %(message)s' LOG_TIMEFMT = r'%H:%M:%S' + +# See https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html +XDG_DATA_HOME = os.environ['XDG_DATA_HOME'] if 'XDG_DATA_HOME' in os.environ else join(expanduser('~'), '.local', 'share') +XDG_CONFIG_HOME = os.environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in os.environ else join(expanduser('~'), '.config') diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 4c48818fa6..86d6557636 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -5,14 +5,17 @@ 'abspath', 'pushd_popd', 'unzip_file_to_dir', + 
'list_resource_candidates' ] import contextlib from os import getcwd, chdir +from os.path import join, expanduser import os.path - from zipfile import ZipFile +from .constants import XDG_DATA_HOME, XDG_CONFIG_HOME + def abspath(url): """ Get a full path to a file or file URL @@ -45,4 +48,18 @@ def unzip_file_to_dir(path_to_zip, output_directory): z.extractall(output_directory) z.close() - +def list_resource_candidates(executable, param, fname, cwd=os.getcwd()): + """ + Generate candidates for processor resources according to + https://ocr-d.de/en/spec/ocrd_tool#file-parameters + """ + candidates = [] + candidates.append(join(cwd, fname)) + processor_path_var = '%s_PATH' % executable.replace('-', '_').upper() + if processor_path_var in os.environ: + candidates += [join(x, fname) for x in os.environ[processor_path_var].split(':')] + if 'VIRTUAL_ENV' in os.environ: + candidates.append(join(os.environ['VIRTUAL_ENV'], 'share', executable, fname)) + candidates.append(join(XDG_DATA_HOME), executable, fname) + candidates.append(join(XDG_CONFIG_HOME), executable, fname) + return candidates From a7b80012474619da1aa467e7911473ac2906fb15 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 10 Aug 2020 17:14:05 +0200 Subject: [PATCH 02/70] utils: list_all_resources to list all processor resources --- ocrd_utils/ocrd_utils/__init__.py | 1 + ocrd_utils/ocrd_utils/os.py | 32 ++++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index 5b51ac92a4..f8e7ad2532 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -125,6 +125,7 @@ from .os import ( abspath, + list_all_resources, list_resource_candidates, pushd_popd, unzip_file_to_dir) diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 86d6557636..7675f8e167 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -10,7 +10,7 @@ import 
contextlib from os import getcwd, chdir -from os.path import join, expanduser +from os.path import join, expanduser, isdir, exists import os.path from zipfile import ZipFile @@ -48,10 +48,10 @@ def unzip_file_to_dir(path_to_zip, output_directory): z.extractall(output_directory) z.close() -def list_resource_candidates(executable, param, fname, cwd=os.getcwd()): +def list_resource_candidates(executable, fname, cwd=os.getcwd()): """ Generate candidates for processor resources according to - https://ocr-d.de/en/spec/ocrd_tool#file-parameters + https://ocr-d.de/en/spec/ocrd_tool#file-parameters (except python-bundled) """ candidates = [] candidates.append(join(cwd, fname)) @@ -63,3 +63,29 @@ def list_resource_candidates(executable, param, fname, cwd=os.getcwd()): candidates.append(join(XDG_DATA_HOME), executable, fname) candidates.append(join(XDG_CONFIG_HOME), executable, fname) return candidates + +def list_all_resources(executable): + """ + List all processor resources in the filesystem according to + https://ocr-d.de/en/spec/ocrd_tool#file-parameters (except python-bundled) + """ + candidates = [] + # XXX this will produce too many false positives + # for root, dirs, files in os.walk(cwd): + # candidates += files + processor_path_var = '%s_PATH' % executable.replace('-', '_').upper() + if processor_path_var in os.environ: + for processor_path in os.environ[processor_path_var].split(':'): + if isdir(processor_path): + for root, dirs, files in os.walk(processor_path): + candidates += files + if 'VIRTUAL_ENV' in os.environ: + sharedir = join(os.environ['VIRTUAL_ENV'], 'share', executable) + if isdir(sharedir): + for root, dirs, files in os.walk(sharedir): + candidates += files + for xdgdir in [join(d, executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME]]: + if isdir(xdgdir): + for root, dirs, files in os.walk(xdgdir): + candidates += files + return candidates From 7e362d7dcc9a06edcec9f0d82697df522298d201 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 10 
Aug 2020 17:43:03 +0200 Subject: [PATCH 03/70] ocrd_utils.constants: XDG_CACHE_HOME --- ocrd_utils/ocrd_utils/__init__.py | 13 ++++++++----- ocrd_utils/ocrd_utils/constants.py | 2 ++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index f8e7ad2532..659f5fc63a 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -69,15 +69,18 @@ """ from .constants import ( - VERSION, - MIMETYPE_PAGE, EXT_TO_MIME, + LOG_FORMAT, + LOG_TIMEFMT, + MIMETYPE_PAGE, MIME_TO_EXT, - PIL_TO_MIME, MIME_TO_PIL, + PIL_TO_MIME, REGEX_PREFIX, - LOG_FORMAT, - LOG_TIMEFMT) + VERSION, + XDG_CACHE_HOME, + XDG_CONFIG_HOME, + XDG_DATA_HOME) from .deprecate import ( deprecated_alias) diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index a7d38b8f32..e096e39d99 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -17,6 +17,7 @@ 'VERSION', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME', + 'XDG_CACHE_HOME', ] VERSION = get_distribution('ocrd_utils').version @@ -92,3 +93,4 @@ # See https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html XDG_DATA_HOME = os.environ['XDG_DATA_HOME'] if 'XDG_DATA_HOME' in os.environ else join(expanduser('~'), '.local', 'share') XDG_CONFIG_HOME = os.environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in os.environ else join(expanduser('~'), '.config') +XDG_CACHE_HOME = os.environ['XDG_CACHE_HOME'] if 'XDG_CACHE_HOME' in os.environ else join(expanduser('~'), '.cache') From 78e84a2bfe19508f066c08c5dd9535d72904f93b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 10 Aug 2020 18:27:52 +0200 Subject: [PATCH 04/70] list_all_resources: also look in XDG_CACHE_HOME --- ocrd_utils/ocrd_utils/os.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 7675f8e167..0e85bd7ed3 100644 --- 
a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -14,7 +14,7 @@ import os.path from zipfile import ZipFile -from .constants import XDG_DATA_HOME, XDG_CONFIG_HOME +from .constants import XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME def abspath(url): """ @@ -84,7 +84,7 @@ def list_all_resources(executable): if isdir(sharedir): for root, dirs, files in os.walk(sharedir): candidates += files - for xdgdir in [join(d, executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME]]: + for xdgdir in [join(d, executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME]]: if isdir(xdgdir): for root, dirs, files in os.walk(xdgdir): candidates += files From 5c75f40809f1365e37356f6706f1e5fd91f534cb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 10 Aug 2020 18:28:13 +0200 Subject: [PATCH 05/70] Processor: implement resolve_resource and list_all_resources --- ocrd/ocrd/processor/base.py | 75 +++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 65ec88bfa0..e5783d3fc1 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -2,11 +2,31 @@ Processor base class and helper functions """ -__all__ = ['Processor', 'generate_processor_help', 'run_cli', 'run_processo'] +__all__ = [ + 'Processor', + 'generate_processor_help', + 'run_cli', + 'run_processo' +] -import os +from os import makedirs +from os.path import exists, isdir, join +from pkg_resources import resource_filename +from shutil import copyfileobj import json -from ocrd_utils import getLogger, VERSION as OCRD_VERSION, MIMETYPE_PAGE +import os +import re + +import requests + +from ocrd_utils import ( + getLogger, + VERSION as OCRD_VERSION, + MIMETYPE_PAGE, + list_resource_candidates, + list_all_resources, + XDG_CACHE_HOME +) from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType @@ -84,6 +104,7 @@ def 
process(self): """ raise Exception("Must be implemented") + def add_metadata(self, pcgts): """ Adds PAGE-XML MetadataItem describing the processing step @@ -99,6 +120,54 @@ def add_metadata(self, pcgts): value=self.parameter[name]) for name in self.parameter.keys()])])) + def resolve_resource(self, parameter_name, val): + """ + Resolve a resource name with the algorithm in + https://ocr-d.de/en/spec/ocrd_tool#file-parameters + + Args: + parameter_name (string): name of parameter to resolve resource for + val (string): resource value to resolve + """ + executable = self.ocrd_tool['executable'] + try: + param = self.ocrd_tool['parameter'][parameter_name] + except KeyError: + raise ValueError("Parameter '%s' not defined in ocrd-tool.json" % parameter_name) + if not param['mimetype']: + raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" % + parameter_name) + if val.startswith('http:') or val.startswith('https:'): + cache_dir = join(XDG_CACHE_HOME, executable) + cache_key = re.sub('[^A-Za-z0-9]', '', val) + cache_fpath = join(cache_dir, cache_key) + # TODO Proper caching (make head request for size, If-Modified etc) + if not exists(cache_fpath): + if not isdir(cache_dir): + makedirs(cache_dir) + with requests.get(val, stream=True) as r: + with open(cache_fpath, 'wb') as f: + copyfileobj(r.raw, f) + return cache_fpath + ret = next([cand + for cand + in list_resource_candidates(executable, param, val) + if exists(cand) + ]) + if ret: + return ret + bundled_fpath = resource_filename(__name__, val) + if exists(bundled_fpath): + return bundled_fpath + raise FileNotFoundError("Could not resolve '%s' file parameter value '%s'" % + (parameter_name, val)) + + def list_all_resources(self): + """ + List all resources found in the filesystem + """ + return list_all_resources(self.ocrd_tool['executable']) + @property def input_files(self): """ From bb63210ed184f2dd012c2de0d5273f6d7ad0332d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 
10 Aug 2020 18:34:41 +0200 Subject: [PATCH 06/70] resolve_resource: also look in XDG_CACHE_HOME --- ocrd_utils/ocrd_utils/os.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 0e85bd7ed3..69ac1b0184 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -62,6 +62,7 @@ def list_resource_candidates(executable, fname, cwd=os.getcwd()): candidates.append(join(os.environ['VIRTUAL_ENV'], 'share', executable, fname)) candidates.append(join(XDG_DATA_HOME), executable, fname) candidates.append(join(XDG_CONFIG_HOME), executable, fname) + candidates.append(join(XDG_CACHE_HOME), executable, fname) return candidates def list_all_resources(executable): From 297a0f30d22b95175bef0415edfe6943a988bc4b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 10 Aug 2020 18:38:59 +0200 Subject: [PATCH 07/70] Processor: fix signature for list_resource_candidates --- ocrd/ocrd/processor/base.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index e5783d3fc1..c14f1db9bf 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -6,16 +6,16 @@ 'Processor', 'generate_processor_help', 'run_cli', - 'run_processo' + 'run_processor' ] from os import makedirs from os.path import exists, isdir, join -from pkg_resources import resource_filename from shutil import copyfileobj import json import os import re +from pkg_resources import resource_filename import requests @@ -149,11 +149,7 @@ def resolve_resource(self, parameter_name, val): with open(cache_fpath, 'wb') as f: copyfileobj(r.raw, f) return cache_fpath - ret = next([cand - for cand - in list_resource_candidates(executable, param, val) - if exists(cand) - ]) + ret = next([cand for cand in list_resource_candidates(executable, val) if exists(cand)]) if ret: return ret bundled_fpath = resource_filename(__name__, val) From 
c999229bcc1ee71dfc0685b1e3b76c720d0a7d46 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 25 Aug 2020 17:59:46 +0200 Subject: [PATCH 08/70] initial test of list_resource_candidates --- ocrd_utils/ocrd_utils/os.py | 6 ++--- tests/processor/test_processor.py | 3 +++ tests/utils/__init__.py | 0 tests/utils/test_os.py | 43 +++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 tests/utils/__init__.py create mode 100644 tests/utils/test_os.py diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 69ac1b0184..19e9cfa9aa 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -60,9 +60,9 @@ def list_resource_candidates(executable, fname, cwd=os.getcwd()): candidates += [join(x, fname) for x in os.environ[processor_path_var].split(':')] if 'VIRTUAL_ENV' in os.environ: candidates.append(join(os.environ['VIRTUAL_ENV'], 'share', executable, fname)) - candidates.append(join(XDG_DATA_HOME), executable, fname) - candidates.append(join(XDG_CONFIG_HOME), executable, fname) - candidates.append(join(XDG_CACHE_HOME), executable, fname) + candidates.append(join(XDG_DATA_HOME, executable, fname)) + candidates.append(join(XDG_CONFIG_HOME, executable, fname)) + candidates.append(join(XDG_CACHE_HOME, executable, fname)) return candidates def list_all_resources(executable): diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index f2e91f6de4..b97cefb1d9 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -89,5 +89,8 @@ def test_run_cli(self): resolver=Resolver(), ) + def test_resolve_files(self): + pass + if __name__ == "__main__": main() diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py new file mode 100644 index 0000000000..01c043715e --- /dev/null +++ b/tests/utils/test_os.py @@ -0,0 +1,43 @@ 
+from tempfile import mkdtemp +from tests.base import TestCase, main, assets +from shutil import rmtree +from os import environ as ENV, getcwd +from os.path import expanduser, join + +from ocrd_utils.os import ( + list_resource_candidates +) + +class TestOsUtils(TestCase): + + def setUp(self): + self.tempdir_path = mkdtemp() + self.tempdir_venv = mkdtemp() + ENV['OCRD_DUMMY_PATH'] = self.tempdir_path + self.VIRTUAL_ENV = ENV['VIRTUAL_ENV'] + ENV['VIRTUAL_ENV'] = self.tempdir_venv + + def tearDown(self): + rmtree(self.tempdir_path) + rmtree(self.tempdir_venv) + del ENV['OCRD_DUMMY_PATH'] + ENV['VIRTUAL_ENV'] = self.VIRTUAL_ENV + + def test_resolve_basic(self): + fname = 'foo.bar' + cands = list_resource_candidates('ocrd-dummy', fname) + print(getcwd()) + cands = [x.replace(getcwd(), '$PWD').replace(expanduser('~'), '$HOME') for x in cands] + self.assertEqual(cands, [join(x, fname) for x in [ + '$PWD', + self.tempdir_path, + join(self.tempdir_venv, 'share', 'ocrd-dummy'), + '$HOME/.local/share/ocrd-dummy', + '$HOME/.config/ocrd-dummy', + '$HOME/.cache/ocrd-dummy', + ]]) + + + +if __name__ == '__main__': + main(__file__) From 479cedd4deb5abbd3699f569e114372bb9d8bf4c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 13 Oct 2020 15:59:08 +0200 Subject: [PATCH 09/70] test_os: not all test environments have VIRTUAL_ENV set --- tests/utils/test_os.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index 01c043715e..7c14bb5be5 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -14,14 +14,17 @@ def setUp(self): self.tempdir_path = mkdtemp() self.tempdir_venv = mkdtemp() ENV['OCRD_DUMMY_PATH'] = self.tempdir_path - self.VIRTUAL_ENV = ENV['VIRTUAL_ENV'] + self.VIRTUAL_ENV = ENV.get('VIRTUAL_ENV') ENV['VIRTUAL_ENV'] = self.tempdir_venv def tearDown(self): rmtree(self.tempdir_path) rmtree(self.tempdir_venv) del ENV['OCRD_DUMMY_PATH'] - ENV['VIRTUAL_ENV'] = self.VIRTUAL_ENV + if 
self.VIRTUAL_ENV: + ENV['VIRTUAL_ENV'] = self.VIRTUAL_ENV + else: + del ENV['VIRTUAL_ENV'] def test_resolve_basic(self): fname = 'foo.bar' From 0d97c2dcb7e6360927a30f062656631cf03bbb85 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Oct 2020 16:57:25 +0100 Subject: [PATCH 10/70] wip --- ocrd_utils/ocrd_utils/constants.py | 12 ++++++++---- tests/utils/test_os.py | 12 +++++++----- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index 33cafb5a97..38e050de91 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -3,7 +3,7 @@ """ from pkg_resources import get_distribution from re import compile as regex_compile -import os +from os import environ from os.path import join, expanduser __all__ = [ @@ -98,6 +98,10 @@ LOG_TIMEFMT = r'%H:%M:%S' # See https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html -XDG_DATA_HOME = os.environ['XDG_DATA_HOME'] if 'XDG_DATA_HOME' in os.environ else join(expanduser('~'), '.local', 'share') -XDG_CONFIG_HOME = os.environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in os.environ else join(expanduser('~'), '.config') -XDG_CACHE_HOME = os.environ['XDG_CACHE_HOME'] if 'XDG_CACHE_HOME' in os.environ else join(expanduser('~'), '.cache') +if 'HOME' in environ and environ['HOME'] != expanduser('~'): + HOME = environ['HOME'] +else: + HOME = expanduser('~') +XDG_DATA_HOME = environ['XDG_DATA_HOME'] if 'XDG_DATA_HOME' in environ else join(HOME, '.local', 'share') +XDG_CONFIG_HOME = environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in environ else join(HOME, '.config') +XDG_CACHE_HOME = environ['XDG_CACHE_HOME'] if 'XDG_CACHE_HOME' in environ else join(HOME, '.cache') diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index 7c14bb5be5..f2cd6efec9 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -27,14 +27,16 @@ def tearDown(self): del ENV['VIRTUAL_ENV'] def 
test_resolve_basic(self): + def dehomify(s): + return s.replace(ENV['HOME'], '$HOME').replace(expanduser('~'), '$HOME') fname = 'foo.bar' cands = list_resource_candidates('ocrd-dummy', fname) - print(getcwd()) - cands = [x.replace(getcwd(), '$PWD').replace(expanduser('~'), '$HOME') for x in cands] + cands = [dehomify(x) for x in cands] + print(cands) self.assertEqual(cands, [join(x, fname) for x in [ - '$PWD', - self.tempdir_path, - join(self.tempdir_venv, 'share', 'ocrd-dummy'), + dehomify(getcwd()), + dehomify(self.tempdir_path), + dehomify(join(self.tempdir_venv, 'share', 'ocrd-dummy')), '$HOME/.local/share/ocrd-dummy', '$HOME/.config/ocrd-dummy', '$HOME/.cache/ocrd-dummy', From 6e7665327984f8fcb7141b4fcd066f439155dfdb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 11 Dec 2020 13:47:58 +0100 Subject: [PATCH 11/70] fixes merge error {f,}chmod --- ocrd/ocrd/processor/base.py | 1 + ocrd_utils/ocrd_utils/os.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 673412a587..0bdd95e517 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -22,6 +22,7 @@ from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, + getLogger, list_resource_candidates, list_all_resources, XDG_CACHE_HOME diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index f33343fe52..64a27c11d8 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -11,7 +11,7 @@ from tempfile import TemporaryDirectory import contextlib -from os import getcwd, chdir, stat, fchmod, umask, environ, walk +from os import getcwd, chdir, stat, chmod, umask, environ, walk from os.path import exists, abspath as abspath_, join, isdir from zipfile import ZipFile From afcd117285a84b1122bd6dc55b09b90302bf9b9c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 15 Dec 2020 13:11:19 +0100 Subject: [PATCH 12/70] run non-logging unit tests with standard $HOME 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index da8aaa3c66..6a278a4445 100644 --- a/Makefile +++ b/Makefile @@ -148,7 +148,7 @@ assets-server: test: assets HOME=$(CURDIR)/ocrd_utils $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging $(TESTDIR) HOME=$(CURDIR) $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging $(TESTDIR) - HOME=$(CURDIR) $(PYTHON) -m pytest --continue-on-collection-errors --ignore=$(TESTDIR)/test_logging.py $(TESTDIR) + $(PYTHON) -m pytest --continue-on-collection-errors --ignore=$(TESTDIR)/test_logging.py $(TESTDIR) test-profile: $(PYTHON) -m cProfile -o profile $$(which pytest) From a3226b1ee353c56c0bca452ab88338bc042036be Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Dec 2020 18:10:02 +0100 Subject: [PATCH 13/70] implement -C/-L cmdline flags --- Makefile | 4 ++-- ocrd/ocrd/decorators/__init__.py | 13 +++++++++++-- ocrd/ocrd/decorators/ocrd_cli_options.py | 3 +++ ocrd/ocrd/processor/base.py | 20 +++++++++++++++++++- ocrd/ocrd/processor/helpers.py | 4 ++++ ocrd_utils/ocrd_utils/os.py | 7 ++++++- 6 files changed, 45 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6a278a4445..820629b31e 100644 --- a/Makefile +++ b/Makefile @@ -64,12 +64,12 @@ deps-ubuntu: # Install test python deps via pip deps-test: - $(PIP) install -U "pip>=19.0.0" + $(PIP) install -U "pip>=19.0.0,!=20.3.2" $(PIP) install -r requirements_test.txt # (Re)install the tool install: - $(PIP) install -U "pip>=19.0.0" wheel + $(PIP) install -U "pip>=19.0.0,!=20.3.2" wheel for mod in $(BUILD_ORDER);do (cd $$mod ; $(PIP_INSTALL) .);done # Install with pip install -e diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index e54f24f388..6469460227 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -28,13 +28,22 @@ def ocrd_cli_wrap_processor( help=False, # pylint: disable=redefined-builtin 
version=False, overwrite=False, + show_resource=None, + list_resources=False, **kwargs ): if not sys.argv[1:]: processorClass(workspace=None, show_help=True) sys.exit(1) - if dump_json or help or version: - processorClass(workspace=None, dump_json=dump_json, show_help=help, show_version=version) + if dump_json or help or version or show_resource or list_resources: + processorClass( + workspace=None, + dump_json=dump_json, + show_help=help, + show_version=version, + show_resource=show_resource, + list_resources=list_resources + ) sys.exit() else: initLogging() diff --git a/ocrd/ocrd/decorators/ocrd_cli_options.py b/ocrd/ocrd/decorators/ocrd_cli_options.py index 27a3b5b10a..9f7f8cafa9 100644 --- a/ocrd/ocrd/decorators/ocrd_cli_options.py +++ b/ocrd/ocrd/decorators/ocrd_cli_options.py @@ -15,6 +15,7 @@ def ocrd_cli_options(f): def cli(mets_url): print(mets_url) """ + # XXX Note that the `--help` output is statically generate_processor_help params = [ option('-m', '--mets', help="METS to process", default="mets.xml"), option('-w', '--working-dir', help="Working Directory"), @@ -25,6 +26,8 @@ def cli(mets_url): option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT'), option('-g', '--page-id', help="ID(s) of the pages to process"), option('--overwrite', help="Overwrite the output file group or a page range (--page-id)", is_flag=True, default=False), + option('-C', '--show-resource', help='Dump the content of processor resource RESNAME', metavar='RESNAME'), + option('-L', '--list-resources', is_flag=True, default=False, help='List names of processor resources'), parameter_option, parameter_override_option, option('-J', '--dump-json', help="Dump tool description as JSON and exit", is_flag=True, default=False), diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 0bdd95e517..9f2a9345f5 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -15,6 +15,7 @@ import json import os import re +import 
sys from pkg_resources import resource_filename import requests @@ -23,6 +24,7 @@ VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger, + initLogging, list_resource_candidates, list_all_resources, XDG_CACHE_HOME @@ -53,6 +55,8 @@ def __init__( input_file_grp="INPUT", output_file_grp="OUTPUT", page_id=None, + show_resource=None, + list_resources=False, show_help=False, show_version=False, dump_json=False, @@ -63,6 +67,20 @@ def __init__( if dump_json: print(json.dumps(ocrd_tool, indent=True)) return + if list_resources: + for res in list_all_resources(ocrd_tool['executable']): + print(res) + return + if show_resource: + res_fname = list_resource_candidates(ocrd_tool['executable'], show_resource, is_file=True) + if not res_fname: + initLogging() + logger = getLogger('ocrd.%s.__init__' % ocrd_tool['executable']) + logger.error("Failed to resolve %s for processort %s" % (show_resource, ocrd_tool['executable'])) + else: + with open(res_fname[0], 'rb') as f: + copyfileobj(f, sys.stdout.buffer) + return self.ocrd_tool = ocrd_tool if show_help: self.show_help() @@ -130,7 +148,7 @@ def add_metadata(self, pcgts): def resolve_resource(self, parameter_name, val): """ - Resolve a resource name with the algorithm in + Resolve a resource name to an absolute file path with the algorithm in https://ocr-d.de/en/spec/ocrd_tool#file-parameters Args: diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 3f6c8e4309..51a02da496 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -34,6 +34,8 @@ def run_processor( log_level=None, # TODO actually use this! 
input_file_grp=None, output_file_grp=None, + show_resource=None, + list_resources=False, parameter=None, parameter_override=None, working_dir=None, @@ -183,6 +185,8 @@ def wrap(s): -w, --working-dir PATH Working directory of local workspace -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE] Log level + -C, --show-resource RESNAME Dump the content of processor resource RESNAME + -L, --list-resources List names of processor resources -J, --dump-json Dump tool description as JSON and exit -h, --help This help message -V, --version Show version diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 64a27c11d8..c388b1d6e2 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -12,6 +12,7 @@ from tempfile import TemporaryDirectory import contextlib from os import getcwd, chdir, stat, chmod, umask, environ, walk +from pathlib import Path from os.path import exists, abspath as abspath_, join, isdir from zipfile import ZipFile @@ -58,7 +59,7 @@ def unzip_file_to_dir(path_to_zip, output_directory): z.extractall(output_directory) z.close() -def list_resource_candidates(executable, fname, cwd=getcwd()): +def list_resource_candidates(executable, fname, cwd=getcwd(), is_file=False, is_dir=False): """ Generate candidates for processor resources according to https://ocr-d.de/en/spec/ocrd_tool#file-parameters (except python-bundled) @@ -73,6 +74,10 @@ def list_resource_candidates(executable, fname, cwd=getcwd()): candidates.append(join(XDG_DATA_HOME, executable, fname)) candidates.append(join(XDG_CONFIG_HOME, executable, fname)) candidates.append(join(XDG_CACHE_HOME, executable, fname)) + if is_file: + candidates = [c for c in candidates if Path(c).is_file()] + if is_dir: + candidates = [c for c in candidates if Path(c).is_dir()] return candidates def list_all_resources(executable): From c2e0460a2df816c586baa4c378718b4693cbc2e9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 21 Dec 2020 17:26:47 +0100 Subject: [PATCH 14/70] 
schema for resource list --- ocrd_validators/ocrd_validators/__init__.py | 2 ++ ocrd_validators/ocrd_validators/constants.py | 2 ++ .../ocrd_validators/resource_list.schema.yml | 30 +++++++++++++++++++ .../resource_list_validator.py | 24 +++++++++++++++ .../validator/test_resource_list_validator.py | 27 +++++++++++++++++ 5 files changed, 85 insertions(+) create mode 100644 ocrd_validators/ocrd_validators/resource_list.schema.yml create mode 100644 ocrd_validators/ocrd_validators/resource_list_validator.py create mode 100644 tests/validator/test_resource_list_validator.py diff --git a/ocrd_validators/ocrd_validators/__init__.py b/ocrd_validators/ocrd_validators/__init__.py index bbf88323af..4819017dd0 100644 --- a/ocrd_validators/ocrd_validators/__init__.py +++ b/ocrd_validators/ocrd_validators/__init__.py @@ -6,6 +6,7 @@ 'WorkspaceValidator', 'PageValidator', 'OcrdToolValidator', + 'OcrdResourceListValidator', 'OcrdZipValidator', 'XsdValidator', 'XsdMetsValidator', @@ -16,6 +17,7 @@ from .workspace_validator import WorkspaceValidator from .page_validator import PageValidator from .ocrd_tool_validator import OcrdToolValidator +from .resource_list_validator import OcrdResourceListValidator from .ocrd_zip_validator import OcrdZipValidator from .xsd_validator import XsdValidator from .xsd_mets_validator import XsdMetsValidator diff --git a/ocrd_validators/ocrd_validators/constants.py b/ocrd_validators/ocrd_validators/constants.py index 6ca02c7075..25d2e0e53b 100644 --- a/ocrd_validators/ocrd_validators/constants.py +++ b/ocrd_validators/ocrd_validators/constants.py @@ -6,6 +6,7 @@ __all__ = [ 'OCRD_TOOL_SCHEMA', + 'RESOURCE_LIST_SCHEMA', 'OCRD_BAGIT_PROFILE', 'BAGIT_TXT', 'FILE_GROUP_PREFIX', @@ -18,6 +19,7 @@ ] OCRD_TOOL_SCHEMA = yaml.safe_load(resource_string(__name__, 'ocrd_tool.schema.yml')) +RESOURCE_LIST_SCHEMA = yaml.safe_load(resource_string(__name__, 'resource_list.schema.yml')) OCRD_BAGIT_PROFILE = yaml.safe_load(resource_string(__name__, 
'bagit-profile.yml')) BAGIT_TXT = 'BagIt-Version: 1.0\nTag-File-Character-Encoding: UTF-8' diff --git a/ocrd_validators/ocrd_validators/resource_list.schema.yml b/ocrd_validators/ocrd_validators/resource_list.schema.yml new file mode 100644 index 0000000000..82d45aa875 --- /dev/null +++ b/ocrd_validators/ocrd_validators/resource_list.schema.yml @@ -0,0 +1,30 @@ +type: object +additionalProperties: false +patternProperties: + '^ocrd-.*': + type: array + items: + type: object + additionalProperties: false + required: + - url + - type + - name + - version_range + properties: + url: + type: string + description: URLs of all components of this resource + name: + type: string + description: Name to store the resource as + type: + type: string + enum: ['direct-link', 'github-file', 'github-dir', 'tarball-link', 'zip-link'] + description: Type of the URL + path_in_archive: + type: string + description: if type is tarball-link or zip-link, the resource is at this location in the archive + version_range: + type: string + description: Range of supported versions, syntax like in PEP 440 diff --git a/ocrd_validators/ocrd_validators/resource_list_validator.py b/ocrd_validators/ocrd_validators/resource_list_validator.py new file mode 100644 index 0000000000..aa782c431e --- /dev/null +++ b/ocrd_validators/ocrd_validators/resource_list_validator.py @@ -0,0 +1,24 @@ +""" +Validating ``resource_list.yml``. + +See `specs `_. +""" +from .constants import RESOURCE_LIST_SCHEMA +from .json_validator import JsonValidator + +# +# ------------------------------------------------- +# + +class OcrdResourceListValidator(JsonValidator): + """ + JsonValidator validating against the ``resource_list.yml`` schema. + """ + + @staticmethod + def validate(obj, schema=RESOURCE_LIST_SCHEMA): + """ + Validate against ``resource_list.yml`` schema. 
+ """ + return JsonValidator.validate(obj, schema) + diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py new file mode 100644 index 0000000000..18e75ae99f --- /dev/null +++ b/tests/validator/test_resource_list_validator.py @@ -0,0 +1,27 @@ +import json + +from tests.base import TestCase, main # pylint: disable=import-error,no-name-in-module +from pytest import fixture + +from ocrd_validators import OcrdResourceListValidator + +@fixture +def reslist(): + return { + 'ocrd-foo': [ + { + 'url': 'https:/foo', + 'type': 'direct-link', + 'name': 'foo', + 'version_range': '>= 0.0.1' + } + ] + } + +def test_resource_list_validator(reslist): + report = OcrdResourceListValidator.validate(reslist) + print(report.errors) + assert report.is_valid == True + +if __name__ == '__main__': + main(__file__) From 5a6ccf33ee250947b38105188d95f93b4f7c204b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 21 Dec 2020 19:03:10 +0100 Subject: [PATCH 15/70] implement foundation of ocrd resmgr --- ocrd/ocrd/cli/__init__.py | 2 + ocrd/ocrd/cli/resmgr.py | 43 ++++++++++++++ ocrd/ocrd/constants.py | 2 + ocrd/ocrd/resource_list.yml | 20 +++++++ ocrd/ocrd/resource_manager.py | 58 +++++++++++++++++++ .../ocrd_validators/resource_list.schema.yml | 8 ++- 6 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 ocrd/ocrd/cli/resmgr.py create mode 100644 ocrd/ocrd/resource_list.yml create mode 100644 ocrd/ocrd/resource_manager.py diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py index 1d1aeda263..5b3d077ddb 100644 --- a/ocrd/ocrd/cli/__init__.py +++ b/ocrd/ocrd/cli/__init__.py @@ -18,6 +18,7 @@ def get_help(self, ctx): from ocrd.cli.process import process_cli from ocrd.cli.bashlib import bashlib_cli from ocrd.cli.validate import validate_cli +from ocrd.cli.resmgr import resmgr_cli from ocrd.decorators import ocrd_loglevel from .zip import zip_cli from .log import log_cli @@ -37,3 +38,4 @@ def 
cli(**kwargs): # pylint: disable=unused-argument cli.add_command(zip_cli) cli.add_command(validate_cli) cli.add_command(log_cli) +cli.add_command(resmgr_cli) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py new file mode 100644 index 0000000000..9955fe7caf --- /dev/null +++ b/ocrd/ocrd/cli/resmgr.py @@ -0,0 +1,43 @@ +import sys + +import click + +from ocrd_utils import initLogging +from ocrd_validators import OcrdZipValidator + +from ..resource_manager import OcrdResourceManager + +@click.group("resmgr") +def resmgr_cli(): + """ + Managing processor resources + """ + initLogging() + +# ---------------------------------------------------------------------- +# ocrd zip list-available +# ---------------------------------------------------------------------- + +@resmgr_cli.command('list-available') +@click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC') +def list_available(executable=None): + """ + List available resources + """ + resmgr = OcrdResourceManager() + for executable, reslist in resmgr.list_available(executable): + print('%s' % executable) + for resdict in reslist: + print('- %s (%s)\n %s' % (resdict['name'], resdict['url'], resdict['description'])) + print() + +@resmgr_cli.command('list-installed') +@click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC') +def list_installed(executable=None): + """ + List installed resources + """ + resmgr = OcrdResourceManager() + ret = [] + for executable, reslist in resmgr.list_installed(executable): + print(executable, reslist) diff --git a/ocrd/ocrd/constants.py b/ocrd/ocrd/constants.py index e1456082a6..f82d2d3bdd 100644 --- a/ocrd/ocrd/constants.py +++ b/ocrd/ocrd/constants.py @@ -9,6 +9,7 @@ 'DOWNLOAD_DIR', 'DEFAULT_REPOSITORY_URL', 'BASHLIB_FILENAME', + 'RESOURCE_LIST_FILENAME', 'BACKUP_DIR', ] @@ -17,4 +18,5 @@ DOWNLOAD_DIR = '/tmp/ocrd-core-downloads' DEFAULT_REPOSITORY_URL = 'http://localhost:5000/' 
BASHLIB_FILENAME = resource_filename(__name__, 'lib.bash') +RESOURCE_LIST_FILENAME = resource_filename(__name__, 'resource_list.yml') BACKUP_DIR = '.backup' diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml new file mode 100644 index 0000000000..f778c80bdc --- /dev/null +++ b/ocrd/ocrd/resource_list.yml @@ -0,0 +1,20 @@ +# List available resources by processor for "ocrd resmgr" +ocrd-tesserocr-recognize: + - url: https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/Fraktur_5000000/tessdata_fast/Fraktur_50000000.334_450937.traineddata + type: file + name: Fraktur_GT4HistOCR.traineddata + description: Tesseract LSTM model trained on GT4HistOCR + version_range: '>= 0.0.1' +ocrd-calamari-recognize: + - url: https://qurator-data.de/calamari-models/GT4HistOCR/2019-07-22T15_49+0200/model.tar.xz + type: archive + name: qurator-gt4hist-0.3 + description: Calamari model trained with GT4HistOCR + path_in_archive: '.' + version_range: '< 1.0.0' + - url: https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz + type: archive + name: qurator-gt4hist-1.0 + description: Calamari model trained with GT4HistOCR + path_in_archive: '.' 
+ version_range: '>= 1.0.0' diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py new file mode 100644 index 0000000000..6a2aba25a0 --- /dev/null +++ b/ocrd/ocrd/resource_manager.py @@ -0,0 +1,58 @@ +from pathlib import Path + +from yaml import safe_load + +from .constants import RESOURCE_LIST_FILENAME + +from ocrd_validators import OcrdResourceListValidator +from ocrd_utils import getLogger +from ocrd_utils.constants import HOME +from ocrd_utils.os import list_resource_candidates, list_all_resources + +builtin_list_filename = Path(RESOURCE_LIST_FILENAME) +user_list_filename = Path(HOME, 'ocrd', 'resources.yml') + +class OcrdResourceManager(): + + """ + Managing processor resources + """ + def __init__(self): + self.log = getLogger('ocrd.resource_manager') + self.database = {} + self.load_resource_list(builtin_list_filename) + self.load_resource_list(user_list_filename) + + def load_resource_list(self, list_filename): + if list_filename.is_file(): + with open(list_filename, 'r', encoding='utf-8') as f: + list_loaded = safe_load(f) + report = OcrdResourceListValidator.validate(list_loaded) + if not report.is_valid: + self.log.error('\n'.join(report.errors)) + raise ValueError("Resource list %s is invalid!" 
% (list_filename)) + for executable, resource_list in list_loaded.items(): + if executable not in self.database: + self.database[executable] = [] + # Prepend, so user provided is sorted before builtin + self.database[executable] = list_loaded[executable] + self.database[executable] + + def list_available(self, executable=None): + if executable: + resources = [(executable, self.database[executable])] + else: + resources = [(x, y) for x, y in self.database.items()] + return resources + + def list_installed(self, executable=None): + ret = [] + for executable in [executable] if executable else self.database.keys(): + reslist = [] + for res_filename in list_all_resources(executable): + res_name = Path(res_filename).name + resdict = [x for x in self.database[executable] if x['name'] == res_name] + if not resdict: + resdict = [{'name': res_name, 'url': '???', 'description': '???', 'version_range': '???'}] + reslist.append(resdict[0]) + ret.append((executable, reslist)) + return ret diff --git a/ocrd_validators/ocrd_validators/resource_list.schema.yml b/ocrd_validators/ocrd_validators/resource_list.schema.yml index 82d45aa875..32ce2b9142 100644 --- a/ocrd_validators/ocrd_validators/resource_list.schema.yml +++ b/ocrd_validators/ocrd_validators/resource_list.schema.yml @@ -8,6 +8,7 @@ patternProperties: additionalProperties: false required: - url + - description - type - name - version_range @@ -15,16 +16,19 @@ patternProperties: url: type: string description: URLs of all components of this resource + description: + type: string + description: A description of the resource name: type: string description: Name to store the resource as type: type: string - enum: ['direct-link', 'github-file', 'github-dir', 'tarball-link', 'zip-link'] + enum: ['file', 'github-file', 'github-dir', 'archive'] description: Type of the URL path_in_archive: type: string - description: if type is tarball-link or zip-link, the resource is at this location in the archive + description: if type is 
archive, the resource is at this location in the archive version_range: type: string description: Range of supported versions, syntax like in PEP 440 From 128f6b73ee85a425519cd2f8bc5e20d0a5242ff6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 21 Dec 2020 19:41:13 +0100 Subject: [PATCH 16/70] ocrd resmgr list-{installed,available} same output --- ocrd/ocrd/cli/resmgr.py | 17 ++++++++--------- ocrd/ocrd/resource_manager.py | 12 ++++++++---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 9955fe7caf..a6e1d946cb 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -7,6 +7,12 @@ from ..resource_manager import OcrdResourceManager +def print_resources(executable, reslist): + print('%s' % executable) + for resdict in reslist: + print('- %s (%s)\n %s' % (resdict['name'], resdict['url'], resdict['description'])) + print() + @click.group("resmgr") def resmgr_cli(): """ @@ -14,10 +20,6 @@ def resmgr_cli(): """ initLogging() -# ---------------------------------------------------------------------- -# ocrd zip list-available -# ---------------------------------------------------------------------- - @resmgr_cli.command('list-available') @click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC') def list_available(executable=None): @@ -26,10 +28,7 @@ def list_available(executable=None): """ resmgr = OcrdResourceManager() for executable, reslist in resmgr.list_available(executable): - print('%s' % executable) - for resdict in reslist: - print('- %s (%s)\n %s' % (resdict['name'], resdict['url'], resdict['description'])) - print() + print_resources(executable, reslist) @resmgr_cli.command('list-installed') @click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC') @@ -40,4 +39,4 @@ def list_installed(executable=None): resmgr = OcrdResourceManager() ret = [] for executable, reslist in 
resmgr.list_installed(executable): - print(executable, reslist) + print_resources(executable, reslist) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 6a2aba25a0..13716ac991 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -38,13 +38,17 @@ def load_resource_list(self, list_filename): self.database[executable] = list_loaded[executable] + self.database[executable] def list_available(self, executable=None): + """ + List models available for download by processor + """ if executable: - resources = [(executable, self.database[executable])] - else: - resources = [(x, y) for x, y in self.database.items()] - return resources + return [(executable, self.database[executable])] + return [(x, y) for x, y in self.database.items()] def list_installed(self, executable=None): + """ + List installed resources, matching with registry by ``name`` + """ ret = [] for executable in [executable] if executable else self.database.keys(): reslist = [] From 0f42da6eb5c61ae300fcb365419805a237481814 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 15:08:32 +0100 Subject: [PATCH 17/70] resmgr: basic downloading of urls of files --- ocrd/ocrd/cli/resmgr.py | 58 ++++++++++++++++++++++++++++++++++- ocrd/ocrd/processor/base.py | 10 ------ ocrd/ocrd/resource_manager.py | 44 +++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 12 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index a6e1d946cb..18355365e9 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -1,8 +1,16 @@ import sys +from os import getcwd +from pathlib import Path import click -from ocrd_utils import initLogging +from ocrd_utils import ( + initLogging, + getLogger, + XDG_CACHE_HOME, + XDG_CONFIG_HOME, + XDG_DATA_HOME +) from ocrd_validators import OcrdZipValidator from ..resource_manager import OcrdResourceManager @@ -40,3 +48,51 @@ def list_installed(executable=None): ret = [] for 
executable, reslist in resmgr.list_installed(executable): print_resources(executable, reslist) + +@resmgr_cli.command('download') +@click.option('-n', '--any-url', help='Allow downloading unregistered resources', is_flag=True) +@click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) +@click.option('-l', '--location', help='Where to store resources', type=click.Choice(['cache', 'config', 'data', 'cwd']), default='cache', show_default=True) +@click.argument('executable', required=True) +@click.argument('url_or_name', required=True) +def download(any_url, overwrite, location, executable, url_or_name): + """ + Download resource URL_OR_NAME for processor EXECUTABLE. + + URL_OR_NAME can either be the ``name`` or ``url`` of a registered resource. + + If ``--any-url`` is given, also accepts URL of non-registered resources for ``URL_OR_NAME``. + """ + log = getLogger('ocrd.cli.resmgr') + resmgr = OcrdResourceManager() + basedir = XDG_CACHE_HOME if location == 'cache' else \ + XDG_DATA_HOME if location == 'data' else \ + XDG_CONFIG_HOME if location == 'config' else \ + getcwd() + is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') + find_kwargs = {'executable': executable} + find_kwargs['url' if is_url else 'name'] = url_or_name + reslist = resmgr.find_resources(**find_kwargs) + if not reslist: + log.info("No resources found in registry") + if is_url and any_url: + log.info("Downloading unregistered resource %s" % url_or_name) + fpath = resmgr.download(executable, url_or_name, overwrite=overwrite, basedir=basedir) + log.info("Downloaded %s to %s" % (url_or_name, fpath)) + log.info("Use in parameters as '%s'" % fpath.name) + else: + sys.exit(1) + else: + for _, resdict in reslist: + fpath = resmgr.download( + executable, + resdict['url'], + name=resdict['name'], + type=resdict['type'], + path_in_archive=resdict.get('path_in_archive', '.'), + overwrite=overwrite, + basedir=basedir + ) + log.info("Downloaded %s to %s" % 
(resdict['url'], fpath)) + log.info("Use in parameters as '%s'" % fpath.name) + diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 9f2a9345f5..1902be2f22 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -164,16 +164,6 @@ def resolve_resource(self, parameter_name, val): raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" % parameter_name) if val.startswith('http:') or val.startswith('https:'): - cache_dir = join(XDG_CACHE_HOME, executable) - cache_key = re.sub('[^A-Za-z0-9]', '', val) - cache_fpath = join(cache_dir, cache_key) - # TODO Proper caching (make head request for size, If-Modified etc) - if not exists(cache_fpath): - if not isdir(cache_dir): - makedirs(cache_dir) - with requests.get(val, stream=True) as r: - with open(cache_fpath, 'wb') as f: - copyfileobj(r.raw, f) return cache_fpath ret = next([cand for cand in list_resource_candidates(executable, val) if exists(cand)]) if ret: diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 13716ac991..14924ab532 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -1,12 +1,15 @@ from pathlib import Path +import re +from shutil import copyfileobj +import requests from yaml import safe_load from .constants import RESOURCE_LIST_FILENAME from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger -from ocrd_utils.constants import HOME +from ocrd_utils.constants import HOME, XDG_CACHE_HOME from ocrd_utils.os import list_resource_candidates, list_all_resources builtin_list_filename = Path(RESOURCE_LIST_FILENAME) @@ -56,7 +59,46 @@ def list_installed(self, executable=None): res_name = Path(res_filename).name resdict = [x for x in self.database[executable] if x['name'] == res_name] if not resdict: + # TODO handle gracefully resdict = [{'name': res_name, 'url': '???', 'description': '???', 'version_range': '???'}] reslist.append(resdict[0]) 
ret.append((executable, reslist)) return ret + + def find_resources(self, executable=None, name=None, url=None): + """ + Find resources in the registry + """ + ret = [] + if executable and executable not in self.database.keys(): + return ret + for executable in [executable] if executable else self.database.keys(): + for resdict in self.database[executable]: + if url and url == resdict['url']: + ret.append((executable, resdict)) + elif name and name == resdict['name']: + ret.append((executable, resdict)) + return ret + + # TODO Proper caching (make head request for size, If-Modified etc) + def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, name=None, type='file', path_in_archive='.'): + """ + Download a resource by URL + """ + log = getLogger('ocrd.resource_manager.download') + destdir = Path(basedir, executable) + if not name: + name = re.sub('[^A-Za-z0-9]', '', url) + fpath = Path(destdir, name) + if fpath.exists() and not overwrite: + log.info("%s to be downloaded to %s which already exists and overwrite is False" % (url, fpath)) + return fpath + destdir.mkdir(parents=True, exist_ok=True) + if type == 'file': + with requests.get(url, stream=True) as r: + with open(fpath, 'wb') as f: + copyfileobj(r.raw, f) + # TODO + # elif type == 'archive': + # elif type == 'github-dir': + # elif type == 'github-file': From b63b4d8a75c2c1c4deb69361868b92928ca16ac4 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 17:10:56 +0100 Subject: [PATCH 18/70] resmgr: support parameter_usage different from resource name --- ocrd/ocrd/cli/resmgr.py | 2 +- ocrd/ocrd/resource_list.yml | 1 + ocrd/ocrd/resource_manager.py | 6 ++++++ ocrd_validators/ocrd_validators/resource_list.schema.yml | 5 +++++ ocrd_validators/ocrd_validators/resource_list_validator.py | 6 +++--- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 18355365e9..eea4bfd7a8 100644 --- a/ocrd/ocrd/cli/resmgr.py 
+++ b/ocrd/ocrd/cli/resmgr.py @@ -94,5 +94,5 @@ def download(any_url, overwrite, location, executable, url_or_name): basedir=basedir ) log.info("Downloaded %s to %s" % (resdict['url'], fpath)) - log.info("Use in parameters as '%s'" % fpath.name) + log.info("Use in parameters as '%s'" % resmgr.parameter_usage(resdict['name'], usage=resdict['parameter_usage'])) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index f778c80bdc..936d4e5167 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -3,6 +3,7 @@ ocrd-tesserocr-recognize: - url: https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/Fraktur_5000000/tessdata_fast/Fraktur_50000000.334_450937.traineddata type: file name: Fraktur_GT4HistOCR.traineddata + parameter_usage: 'without-extension' description: Tesseract LSTM model trained on GT4HistOCR version_range: '>= 0.0.1' ocrd-calamari-recognize: diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 14924ab532..254164c528 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -80,6 +80,12 @@ def find_resources(self, executable=None, name=None, url=None): ret.append((executable, resdict)) return ret + def parameter_usage(self, name, usage='as-is'): + if usage == 'as-is': + return name + if usage == 'without-extension': + return Path(name).stem + # TODO Proper caching (make head request for size, If-Modified etc) def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, name=None, type='file', path_in_archive='.'): """ diff --git a/ocrd_validators/ocrd_validators/resource_list.schema.yml b/ocrd_validators/ocrd_validators/resource_list.schema.yml index 32ce2b9142..3773641ef6 100644 --- a/ocrd_validators/ocrd_validators/resource_list.schema.yml +++ b/ocrd_validators/ocrd_validators/resource_list.schema.yml @@ -26,6 +26,11 @@ patternProperties: type: string enum: ['file', 'github-file', 'github-dir', 'archive'] description: Type of the URL + 
parameter_usage: + type: string + description: Defines how the parameter is to be used + enum: ['as-is', 'without-extension'] + default: 'as-is' path_in_archive: type: string description: if type is archive, the resource is at this location in the archive diff --git a/ocrd_validators/ocrd_validators/resource_list_validator.py b/ocrd_validators/ocrd_validators/resource_list_validator.py index aa782c431e..ab1b53a2f6 100644 --- a/ocrd_validators/ocrd_validators/resource_list_validator.py +++ b/ocrd_validators/ocrd_validators/resource_list_validator.py @@ -4,7 +4,7 @@ See `specs `_. """ from .constants import RESOURCE_LIST_SCHEMA -from .json_validator import JsonValidator +from .json_validator import JsonValidator, DefaultValidatingDraft4Validator # # ------------------------------------------------- @@ -18,7 +18,7 @@ class OcrdResourceListValidator(JsonValidator): @staticmethod def validate(obj, schema=RESOURCE_LIST_SCHEMA): """ - Validate against ``resource_list.yml`` schema. + Validate against ``resource_list.schema.yml`` schema. 
""" - return JsonValidator.validate(obj, schema) + return JsonValidator(schema, validator_class=DefaultValidatingDraft4Validator)._validate(obj) From 3f0eeacdfde900e1f3b87bd03cd7dd61466c21a7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 17:32:46 +0100 Subject: [PATCH 19/70] add more models to resource_list.yml --- ocrd/ocrd/resource_list.yml | 58 ++++++++++++++++++- .../ocrd_validators/resource_list.schema.yml | 5 +- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index 936d4e5167..4a7183a85b 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -1,11 +1,37 @@ # List available resources by processor for "ocrd resmgr" ocrd-tesserocr-recognize: - url: https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/Fraktur_5000000/tessdata_fast/Fraktur_50000000.334_450937.traineddata - type: file name: Fraktur_GT4HistOCR.traineddata parameter_usage: 'without-extension' description: Tesseract LSTM model trained on GT4HistOCR - version_range: '>= 0.0.1' + - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/equ.traineddata + name: equ.traineddata + parameter_usage: 'without-extension' + description: Tesseract equ model + - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/osd.traineddata + name: osd.traineddata + parameter_usage: 'without-extension' + description: Tesseract osd model + - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata + name: eng.traineddata + parameter_usage: 'without-extension' + description: Tesseract eng model + - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/deu.traineddata + name: deu.traineddata + parameter_usage: 'without-extension' + description: Tesseract deu model + - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/frk.traineddata + name: frk.traineddata + parameter_usage: 'without-extension' + description: Tesseract frk model + - url: 
https://github.com/tesseract-ocr/tessdata_fast/raw/master/script/Fraktur.traineddata + name: Fraktur.traineddata + parameter_usage: 'without-extension' + description: Tesseract Fraktur model + - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/script/Latin.traineddata + name: Latin.traineddata + parameter_usage: 'without-extension' + description: Tesseract Latin model ocrd-calamari-recognize: - url: https://qurator-data.de/calamari-models/GT4HistOCR/2019-07-22T15_49+0200/model.tar.xz type: archive @@ -19,3 +45,31 @@ ocrd-calamari-recognize: description: Calamari model trained with GT4HistOCR path_in_archive: '.' version_range: '>= 1.0.0' +ocrd-ocropy-recognize: + - url: https://github.com/zuphilip/ocropy-models/raw/master/en-default.pyrnn.gz + name: en-default.pyrnn.gz + description: Default ocropy model + - url: https://github.com/zuphilip/ocropy-models/raw/master/fraktur.pyrnn.gz + name: fraktur.pyrnn.gz + description: Default ocropy fraktur model + - url: https://github.com/jze/ocropus-model_fraktur/raw/master/fraktur.pyrnn.gz + name: fraktur-jze.pyrnn.gz + description: ocropy fraktur model by github.com/jze + - url: https://github.com/chreul/OCR_Testdata_EarlyPrintedBooks/raw/master/LatinHist-98000.pyrnn.gz + name: LatinHist.pyrnn.gz + description: ocropy historical latin model by github.com/chreul +ocrd-typegroups-classifier: + - url: https://github.com/seuretm/ocrd_typegroups_classifier/raw/master/ocrd_typegroups_classifier/models/densenet121.tgc + name: densenet121.tgc + description: Network to predict font families with. Bundled with standard installation. 
+ocrd-sbb-binarize: + - url: https://qurator-data.de/sbb_binarization/models.tar.gz + description: default models provided by github.com/qurator-spk + name: default + type: archive + path_in_archive: models +ocrd-sbb-textline-detector: + - url: https://qurator-data.de/sbb_textline_detector/models.tar.gz + description: default models provided by github.com/qurator-spk + name: default + type: archive diff --git a/ocrd_validators/ocrd_validators/resource_list.schema.yml b/ocrd_validators/ocrd_validators/resource_list.schema.yml index 3773641ef6..216e0dbec3 100644 --- a/ocrd_validators/ocrd_validators/resource_list.schema.yml +++ b/ocrd_validators/ocrd_validators/resource_list.schema.yml @@ -9,9 +9,7 @@ patternProperties: required: - url - description - - type - name - - version_range properties: url: type: string @@ -25,6 +23,7 @@ patternProperties: type: type: string enum: ['file', 'github-file', 'github-dir', 'archive'] + default: file description: Type of the URL parameter_usage: type: string @@ -34,6 +33,8 @@ patternProperties: path_in_archive: type: string description: if type is archive, the resource is at this location in the archive + default: '.' 
version_range: type: string description: Range of supported versions, syntax like in PEP 440 + default: '>= 0.0.1' From 6e6e424e02ecf6138b907e1fdfa9cb54fad256d8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 17:55:34 +0100 Subject: [PATCH 20/70] resmgr: simplify resource typing --- ocrd/ocrd/cli/resmgr.py | 2 +- ocrd/ocrd/resource_list.yml | 8 ++++---- ocrd/ocrd/resource_manager.py | 11 ++++++----- .../ocrd_validators/resource_list.schema.yml | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index eea4bfd7a8..c83e87f29f 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -88,7 +88,7 @@ def download(any_url, overwrite, location, executable, url_or_name): executable, resdict['url'], name=resdict['name'], - type=resdict['type'], + resource_type=resdict['type'], path_in_archive=resdict.get('path_in_archive', '.'), overwrite=overwrite, basedir=basedir diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index 4a7183a85b..331be19ace 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -34,13 +34,13 @@ ocrd-tesserocr-recognize: description: Tesseract Latin model ocrd-calamari-recognize: - url: https://qurator-data.de/calamari-models/GT4HistOCR/2019-07-22T15_49+0200/model.tar.xz - type: archive + type: tarball name: qurator-gt4hist-0.3 description: Calamari model trained with GT4HistOCR path_in_archive: '.' version_range: '< 1.0.0' - url: https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz - type: archive + type: tarball name: qurator-gt4hist-1.0 description: Calamari model trained with GT4HistOCR path_in_archive: '.' 
@@ -66,10 +66,10 @@ ocrd-sbb-binarize: - url: https://qurator-data.de/sbb_binarization/models.tar.gz description: default models provided by github.com/qurator-spk name: default - type: archive + type: tarball path_in_archive: models ocrd-sbb-textline-detector: - url: https://qurator-data.de/sbb_textline_detector/models.tar.gz description: default models provided by github.com/qurator-spk name: default - type: archive + type: tarball diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 254164c528..c31b0dd1dc 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -87,7 +87,7 @@ def parameter_usage(self, name, usage='as-is'): return Path(name).stem # TODO Proper caching (make head request for size, If-Modified etc) - def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, name=None, type='file', path_in_archive='.'): + def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, name=None, resource_type='file', path_in_archive='.'): """ Download a resource by URL """ @@ -100,11 +100,12 @@ def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, nam log.info("%s to be downloaded to %s which already exists and overwrite is False" % (url, fpath)) return fpath destdir.mkdir(parents=True, exist_ok=True) - if type == 'file': + if resource_type == 'file': with requests.get(url, stream=True) as r: with open(fpath, 'wb') as f: copyfileobj(r.raw, f) + # elif resource_type == archive: # TODO - # elif type == 'archive': - # elif type == 'github-dir': - # elif type == 'github-file': + # elif resource_type == 'archive': + # elif resource_type == 'github-dir': + # elif resource_type == 'github-file': diff --git a/ocrd_validators/ocrd_validators/resource_list.schema.yml b/ocrd_validators/ocrd_validators/resource_list.schema.yml index 216e0dbec3..abed0ad1e8 100644 --- a/ocrd_validators/ocrd_validators/resource_list.schema.yml +++ 
b/ocrd_validators/ocrd_validators/resource_list.schema.yml @@ -22,7 +22,7 @@ patternProperties: description: Name to store the resource as type: type: string - enum: ['file', 'github-file', 'github-dir', 'archive'] + enum: ['file', 'github-dir', 'tarball'] default: file description: Type of the URL parameter_usage: From 5843e1be0b45ee966e313f7532cb8d79a2f0e566 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 18:17:55 +0100 Subject: [PATCH 21/70] resmgr: support tarball downloads --- ocrd/ocrd/resource_manager.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index c31b0dd1dc..55398d0f2f 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -1,6 +1,8 @@ from pathlib import Path import re -from shutil import copyfileobj +from shutil import copyfileobj, copytree +from tempfile import TemporaryFile +from tarfile import open as open_tarfile import requests from yaml import safe_load @@ -10,7 +12,7 @@ from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger from ocrd_utils.constants import HOME, XDG_CACHE_HOME -from ocrd_utils.os import list_resource_candidates, list_all_resources +from ocrd_utils.os import list_resource_candidates, list_all_resources, pushd_popd builtin_list_filename = Path(RESOURCE_LIST_FILENAME) user_list_filename = Path(HOME, 'ocrd', 'resources.yml') @@ -104,8 +106,18 @@ def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, nam with requests.get(url, stream=True) as r: with open(fpath, 'wb') as f: copyfileobj(r.raw, f) - # elif resource_type == archive: + elif resource_type == 'tarball': + with pushd_popd(tempdir=True): + log.info("Downloading %s" % url) + with open('download.tar.xx', 'wb') as f_write_tar: + with requests.get(url, stream=True) as r: + copyfileobj(r.raw, f_write_tar) + Path('out').mkdir() + with pushd_popd('out'): + 
log.info("Extracting tarball") + with open_tarfile('../download.tar.xx', 'r:*') as tar: + tar.extractall() + log.info("Copying '%s' from tarball to %s" % (path_in_archive, fpath)) + copytree(path_in_archive, str(fpath)) # TODO - # elif resource_type == 'archive': # elif resource_type == 'github-dir': - # elif resource_type == 'github-file': From 0edac7029598c1b70c97d3f698bfb8dadc223a6d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 18:25:23 +0100 Subject: [PATCH 22/70] search for resources only on top-level --- ocrd_utils/ocrd_utils/os.py | 11 ++++------- tests/validator/test_resource_list_validator.py | 3 ++- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index c388b1d6e2..64138dfead 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -11,7 +11,7 @@ from tempfile import TemporaryDirectory import contextlib -from os import getcwd, chdir, stat, chmod, umask, environ, walk +from os import getcwd, chdir, stat, chmod, umask, environ, scandir from pathlib import Path from os.path import exists, abspath as abspath_, join, isdir from zipfile import ZipFile @@ -93,17 +93,14 @@ def list_all_resources(executable): if processor_path_var in environ: for processor_path in environ[processor_path_var].split(':'): if isdir(processor_path): - for root, dirs, files in walk(processor_path): - candidates += files + candidates += list(scandir(processor_path)) if 'VIRTUAL_ENV' in environ: sharedir = join(environ['VIRTUAL_ENV'], 'share', executable) if isdir(sharedir): - for root, dirs, files in walk(sharedir): - candidates += files + candidates += list(scandir(sharedir)) for xdgdir in [join(d, executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME]]: if isdir(xdgdir): - for root, dirs, files in walk(xdgdir): - candidates += files + candidates += list(scandir(xdgdir)) return candidates # ht @pabs3 diff --git 
a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py index 18e75ae99f..a7bfe3e33e 100644 --- a/tests/validator/test_resource_list_validator.py +++ b/tests/validator/test_resource_list_validator.py @@ -11,7 +11,8 @@ def reslist(): 'ocrd-foo': [ { 'url': 'https:/foo', - 'type': 'direct-link', + 'type': 'file', + 'description': 'something descriptive', 'name': 'foo', 'version_range': '>= 0.0.1' } From f96ce5e8902b3de36ed500bb2cadc0a9ceac2b96 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 18:39:57 +0100 Subject: [PATCH 23/70] use resmgr in Processor.resolve_resource --- ocrd/ocrd/processor/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 1902be2f22..424a27bea9 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -31,6 +31,7 @@ ) from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType +from ..resource_manager import OcrdResourceManager # XXX imports must remain for backwards-compatibilty from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import @@ -164,7 +165,7 @@ def resolve_resource(self, parameter_name, val): raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" % parameter_name) if val.startswith('http:') or val.startswith('https:'): - return cache_fpath + return OcrdResourceManager().download(executable, val) ret = next([cand for cand in list_resource_candidates(executable, val) if exists(cand)]) if ret: return ret From fa90f1bf1d8305c15ed73612f012842bc66654b9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 22 Dec 2020 19:10:14 +0100 Subject: [PATCH 24/70] simplify Processor.resolve_resource, delegate to resmgr as much as possible --- ocrd/ocrd/processor/base.py | 37 ++++++++++++++++++----------------- ocrd/ocrd/resource_manager.py 
| 1 + 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 424a27bea9..90245aa465 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -147,33 +147,34 @@ def add_metadata(self, pcgts): value=OCRD_VERSION)]) ])) - def resolve_resource(self, parameter_name, val): + def resolve_resource(self, val): """ Resolve a resource name to an absolute file path with the algorithm in https://ocr-d.de/en/spec/ocrd_tool#file-parameters Args: - parameter_name (string): name of parameter to resolve resource for val (string): resource value to resolve """ executable = self.ocrd_tool['executable'] - try: - param = self.ocrd_tool['parameter'][parameter_name] - except KeyError: - raise ValueError("Parameter '%s' not defined in ocrd-tool.json" % parameter_name) - if not param['mimetype']: - raise ValueError("Parameter '%s' is not a file parameter (has no 'mimetype' field)" % - parameter_name) - if val.startswith('http:') or val.startswith('https:'): - return OcrdResourceManager().download(executable, val) - ret = next([cand for cand in list_resource_candidates(executable, val) if exists(cand)]) + if exists(val): + return val + ret = [cand for cand in list_resource_candidates(executable, val) if exists(cand)] if ret: - return ret - bundled_fpath = resource_filename(__name__, val) - if exists(bundled_fpath): - return bundled_fpath - raise FileNotFoundError("Could not resolve '%s' file parameter value '%s'" % - (parameter_name, val)) + return ret[0] + resmgr = OcrdResourceManager() + reslist = resmgr.find_resources(executable, name=val) + if not reslist: + reslist = resmgr.find_resources(executable, url=val) + if not reslist: + raise FileNotFoundError("Could not resolve '%s'" % val) + _, resdict = reslist[0] + return str(resmgr.download( + executable, + url=resdict['url'], + name=resdict['name'], + path_in_archive=resdict['path_in_archive'], + resource_type=resdict['type'] + )) def 
list_all_resources(self): """ diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 55398d0f2f..ba232bcdce 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -121,3 +121,4 @@ def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, nam copytree(path_in_archive, str(fpath)) # TODO # elif resource_type == 'github-dir': + return fpath From 89f77f01bc714bfc1223901fad1df757c662ca81 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Dec 2020 11:17:13 +0100 Subject: [PATCH 25/70] resmgr: add anybaseocr resources --- ocrd/ocrd/resource_list.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index 331be19ace..035da9e4da 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -73,3 +73,22 @@ ocrd-sbb-textline-detector: description: default models provided by github.com/qurator-spk name: default type: tarball +ocrd-anybaseocr-dewarp: + - url: https://ocr-d-repo.scc.kit.edu/models/dfki/dewarping/latest_net_G.pth + name: latest_net_G.pth + description: dewarping model for anybaseocr +ocrd-anybaseocr-block-segmentation: + - url: https://ocr-d-repo.scc.kit.edu/models/dfki/segmentation/block_segmentation_weights.h5 + name: block_segmentation_weights.h5 + description: block segmentation model for anybaseocr +ocrd-anybaseocr-layout-analysis: + - url: https://ocr-d-repo.scc.kit.edu/models/dfki/layoutAnalysis/structure_analysis.h5 + name: structure_analysis.h5 + description: structure analysis model for anybaseocr + - url: https://ocr-d-repo.scc.kit.edu/models/dfki/layoutAnalysis/mapping_densenet.pickle + name: mapping_densenet.pickle + description: mapping model for anybaseocr +ocrd-anybaseocr-tiseg: + - url: https://ocr-d-repo.scc.kit.edu/models/dfki/tiseg/seg_model.hdf5 + name: seg_model.hdf5 + description: text image segmentation model for anybaseocr From 
18009a7b5dc5672a3e04f91a5bb35cfc543d79f7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Dec 2020 12:13:24 +0100 Subject: [PATCH 26/70] resmgr download: show progressbar, add size to resource list --- ocrd/ocrd/cli/resmgr.py | 32 +++++++++++++------ ocrd/ocrd/resource_list.yml | 22 +++++++++++++ ocrd/ocrd/resource_manager.py | 30 +++++++++++++---- .../ocrd_validators/resource_list.schema.yml | 4 +++ 4 files changed, 71 insertions(+), 17 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index c83e87f29f..1c8387f890 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -77,22 +77,34 @@ def download(any_url, overwrite, location, executable, url_or_name): log.info("No resources found in registry") if is_url and any_url: log.info("Downloading unregistered resource %s" % url_or_name) - fpath = resmgr.download(executable, url_or_name, overwrite=overwrite, basedir=basedir) + with requests.head(url_or_name) as r: + content_length = int(r.headers.get('content-length')) + with click.progressbar(length=content_length) as bar: + fpath = resmgr.download( + executable, + url_or_name, + overwrite=overwrite, + basedir=basedir, + progress_cb=lambda delta: bar.update(delta) + ) log.info("Downloaded %s to %s" % (url_or_name, fpath)) log.info("Use in parameters as '%s'" % fpath.name) else: sys.exit(1) else: for _, resdict in reslist: - fpath = resmgr.download( - executable, - resdict['url'], - name=resdict['name'], - resource_type=resdict['type'], - path_in_archive=resdict.get('path_in_archive', '.'), - overwrite=overwrite, - basedir=basedir - ) + log.info("Downloading resource %s" % resdict) + with click.progressbar(length=resdict['size']) as bar: + fpath = resmgr.download( + executable, + resdict['url'], + name=resdict['name'], + resource_type=resdict['type'], + path_in_archive=resdict.get('path_in_archive', '.'), + overwrite=overwrite, + basedir=basedir, + progress_cb=lambda delta: bar.update(delta) + ) log.info("Downloaded 
%s to %s" % (resdict['url'], fpath)) log.info("Use in parameters as '%s'" % resmgr.parameter_usage(resdict['name'], usage=resdict['parameter_usage'])) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index 035da9e4da..053ec12880 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -4,91 +4,113 @@ ocrd-tesserocr-recognize: name: Fraktur_GT4HistOCR.traineddata parameter_usage: 'without-extension' description: Tesseract LSTM model trained on GT4HistOCR + size: 1058487 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/equ.traineddata name: equ.traineddata parameter_usage: 'without-extension' description: Tesseract equ model + size: 2251950 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/osd.traineddata name: osd.traineddata parameter_usage: 'without-extension' description: Tesseract osd model + size: 10562727 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata name: eng.traineddata parameter_usage: 'without-extension' description: Tesseract eng model + size: 4113088 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/deu.traineddata name: deu.traineddata parameter_usage: 'without-extension' description: Tesseract deu model + size: 1525436 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/frk.traineddata name: frk.traineddata parameter_usage: 'without-extension' description: Tesseract frk model + size: 6423052 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/script/Fraktur.traineddata name: Fraktur.traineddata parameter_usage: 'without-extension' description: Tesseract Fraktur model + size: 10915632 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/script/Latin.traineddata name: Latin.traineddata parameter_usage: 'without-extension' description: Tesseract Latin model + size: 89384811 ocrd-calamari-recognize: - url: https://qurator-data.de/calamari-models/GT4HistOCR/2019-07-22T15_49+0200/model.tar.xz type: 
tarball name: qurator-gt4hist-0.3 description: Calamari model trained with GT4HistOCR + size: 116439072 path_in_archive: '.' version_range: '< 1.0.0' - url: https://qurator-data.de/calamari-models/GT4HistOCR/2019-12-11T11_10+0100/model.tar.xz type: tarball name: qurator-gt4hist-1.0 description: Calamari model trained with GT4HistOCR + size: 90275264 path_in_archive: '.' version_range: '>= 1.0.0' ocrd-ocropy-recognize: - url: https://github.com/zuphilip/ocropy-models/raw/master/en-default.pyrnn.gz name: en-default.pyrnn.gz description: Default ocropy model + size: 83826134 - url: https://github.com/zuphilip/ocropy-models/raw/master/fraktur.pyrnn.gz name: fraktur.pyrnn.gz description: Default ocropy fraktur model + size: 43882365 - url: https://github.com/jze/ocropus-model_fraktur/raw/master/fraktur.pyrnn.gz name: fraktur-jze.pyrnn.gz description: ocropy fraktur model by github.com/jze + size: 2961298 - url: https://github.com/chreul/OCR_Testdata_EarlyPrintedBooks/raw/master/LatinHist-98000.pyrnn.gz name: LatinHist.pyrnn.gz description: ocropy historical latin model by github.com/chreul + size: 16989864 ocrd-typegroups-classifier: - url: https://github.com/seuretm/ocrd_typegroups_classifier/raw/master/ocrd_typegroups_classifier/models/densenet121.tgc name: densenet121.tgc description: Network to predict font families with. Bundled with standard installation. 
+ size: 28509377 ocrd-sbb-binarize: - url: https://qurator-data.de/sbb_binarization/models.tar.gz description: default models provided by github.com/qurator-spk name: default type: tarball path_in_archive: models + size: 1654623597 ocrd-sbb-textline-detector: - url: https://qurator-data.de/sbb_textline_detector/models.tar.gz description: default models provided by github.com/qurator-spk name: default type: tarball + size: 1194551551 ocrd-anybaseocr-dewarp: - url: https://ocr-d-repo.scc.kit.edu/models/dfki/dewarping/latest_net_G.pth name: latest_net_G.pth description: dewarping model for anybaseocr + size: 805292230 ocrd-anybaseocr-block-segmentation: - url: https://ocr-d-repo.scc.kit.edu/models/dfki/segmentation/block_segmentation_weights.h5 name: block_segmentation_weights.h5 description: block segmentation model for anybaseocr + size: 256139800 ocrd-anybaseocr-layout-analysis: - url: https://ocr-d-repo.scc.kit.edu/models/dfki/layoutAnalysis/structure_analysis.h5 name: structure_analysis.h5 description: structure analysis model for anybaseocr + size: 31477056 - url: https://ocr-d-repo.scc.kit.edu/models/dfki/layoutAnalysis/mapping_densenet.pickle name: mapping_densenet.pickle description: mapping model for anybaseocr + size: 374 ocrd-anybaseocr-tiseg: - url: https://ocr-d-repo.scc.kit.edu/models/dfki/tiseg/seg_model.hdf5 name: seg_model.hdf5 description: text image segmentation model for anybaseocr + size: 66080688 diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index ba232bcdce..a91cad3992 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -88,8 +88,28 @@ def parameter_usage(self, name, usage='as-is'): if usage == 'without-extension': return Path(name).stem + def _download_impl(self, url, filename, progress_cb=None): + with open(filename, 'wb') as f: + with requests.get(url, stream=True) as r: + total = int(r.headers.get('content-length')) + # copyfileobj(r.raw, f_write_tar) + for data in 
r.iter_content(chunk_size=4096): + if progress_cb: + progress_cb(len(data)) + f.write(data) + # TODO Proper caching (make head request for size, If-Modified etc) - def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, name=None, resource_type='file', path_in_archive='.'): + def download( + self, + executable, + url, + overwrite=False, + basedir=XDG_CACHE_HOME, + name=None, + resource_type='file', + path_in_archive='.', + progress_cb=None, + ): """ Download a resource by URL """ @@ -103,15 +123,11 @@ def download(self, executable, url, overwrite=False, basedir=XDG_CACHE_HOME, nam return fpath destdir.mkdir(parents=True, exist_ok=True) if resource_type == 'file': - with requests.get(url, stream=True) as r: - with open(fpath, 'wb') as f: - copyfileobj(r.raw, f) + self._download_impl(url, fpath, progress_cb) elif resource_type == 'tarball': with pushd_popd(tempdir=True): log.info("Downloading %s" % url) - with open('download.tar.xx', 'wb') as f_write_tar: - with requests.get(url, stream=True) as r: - copyfileobj(r.raw, f_write_tar) + self._download_impl(url, 'download.tar.xx', progress_cb) Path('out').mkdir() with pushd_popd('out'): log.info("Extracting tarball") diff --git a/ocrd_validators/ocrd_validators/resource_list.schema.yml b/ocrd_validators/ocrd_validators/resource_list.schema.yml index abed0ad1e8..5afb0531a8 100644 --- a/ocrd_validators/ocrd_validators/resource_list.schema.yml +++ b/ocrd_validators/ocrd_validators/resource_list.schema.yml @@ -10,6 +10,7 @@ patternProperties: - url - description - name + - size properties: url: type: string @@ -38,3 +39,6 @@ patternProperties: type: string description: Range of supported versions, syntax like in PEP 440 default: '>= 0.0.1' + size: + type: number + description: Size of the resource in bytes From 2df4c227bd7d21a93767b4bb89862bdfe365f734 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Dec 2020 12:30:11 +0100 Subject: [PATCH 27/70] fix resmgr test --- 
tests/validator/test_resource_list_validator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py index a7bfe3e33e..eb95d9b1ea 100644 --- a/tests/validator/test_resource_list_validator.py +++ b/tests/validator/test_resource_list_validator.py @@ -12,6 +12,7 @@ def reslist(): { 'url': 'https:/foo', 'type': 'file', + 'size': 123, 'description': 'something descriptive', 'name': 'foo', 'version_range': '>= 0.0.1' From e8d0e0fd9e4d033267894d5d87d7ea55b9cc29e9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Dec 2020 12:34:37 +0100 Subject: [PATCH 28/70] resmgr download: * to download all resources for this model --- ocrd/ocrd/cli/resmgr.py | 6 +++++- ocrd/ocrd/resource_manager.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 1c8387f890..bc82e14f6d 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -1,6 +1,7 @@ import sys from os import getcwd from pathlib import Path +import requests import click @@ -61,6 +62,8 @@ def download(any_url, overwrite, location, executable, url_or_name): URL_OR_NAME can either be the ``name`` or ``url`` of a registered resource. + If URL_OR_NAME is '*' (asterisk), download all known resources for this processor + If ``--any-url`` is given, also accepts URL of non-registered resources for ``URL_OR_NAME``. 
""" log = getLogger('ocrd.cli.resmgr') @@ -71,7 +74,8 @@ def download(any_url, overwrite, location, executable, url_or_name): getcwd() is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') find_kwargs = {'executable': executable} - find_kwargs['url' if is_url else 'name'] = url_or_name + if url_or_name != '*': + find_kwargs['url' if is_url else 'name'] = url_or_name reslist = resmgr.find_resources(**find_kwargs) if not reslist: log.info("No resources found in registry") diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index a91cad3992..e1060ea1b9 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -76,7 +76,9 @@ def find_resources(self, executable=None, name=None, url=None): return ret for executable in [executable] if executable else self.database.keys(): for resdict in self.database[executable]: - if url and url == resdict['url']: + if not name and not url: + ret.append((executable, resdict)) + elif url and url == resdict['url']: ret.append((executable, resdict)) elif name and name == resdict['name']: ret.append((executable, resdict)) From 1fd35b981c8d9899a43ab934711069b487d3e0c8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 28 Dec 2020 13:10:19 +0100 Subject: [PATCH 29/70] :package: pre-release 2.22.0b1 --- CHANGELOG.md | 11 +++++++++++ ocrd_utils/setup.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 443d0c3017..b0cc5f7086 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,10 +5,20 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [2.22.0b1] - 2020-12-28 + Fixed: * `run_cli`: don't reference undefined vars in error handler, #651 +Changed: + + * Implement file resource algorithm from OCR-D/spec#169, #559 + +Added: + + * New CLI `ocrd resmgr` to download/browse processor resources, #559 + ## [2.21.0] - 2020-11-27 Changed: @@ -1240,6 +1250,7 @@ Fixed Initial Release +[2.22.0b1]: ../../compare/v2.22.0b1..v2.21.0 [2.21.0]: ../../compare/v2.21.0..v2.20.2 [2.20.2]: ../../compare/v2.20.2..v2.20.1 [2.20.1]: ../../compare/v2.20.1..v2.20.0 diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index a6d234fa67..f44d8a03c1 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='2.21.0', + version='2.22.0b1', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown', From 849de104b882916906b5596f1e641a093edafc68 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 30 Dec 2020 18:19:43 +0100 Subject: [PATCH 30/70] new PAGE XML user method get_AllTextLine --- ocrd/ocrd/processor/base.py | 2 +- ocrd_models/ocrd_models/ocrd_page_generateds.py | 11 ++++++++++- ocrd_models/ocrd_page_user_methods.py | 1 + ocrd_models/ocrd_page_user_methods/get_AllTextLine.py | 9 +++++++++ 4 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 ocrd_models/ocrd_page_user_methods/get_AllTextLine.py diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 90245aa465..dc495494c9 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -166,7 +166,7 @@ def resolve_resource(self, val): if not reslist: reslist = resmgr.find_resources(executable, url=val) if not reslist: - raise FileNotFoundError("Could not resolve '%s'" % val) + raise FileNotFoundError("Could not resolve %s resource '%s'" % (executable, val)) _, resdict = reslist[0] return str(resmgr.download( executable, diff --git 
a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 807ec0592b..06aaf7d539 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri Nov 27 15:25:12 2020 by generateDS.py version 2.35.20. +# Generated Wed Dec 30 18:18:58 2020 by generateDS.py version 2.35.20. # Python 3.6.9 (default, Oct 8 2020, 12:12:24) [GCC 8.4.0] # # Command line options: @@ -3087,6 +3087,15 @@ def set_Border(self, Border): """ self.invalidate_AlternativeImage(feature_selector='cropped') self.Border = Border + def get_AllTextLine(self): + """ + Return all the TextLine in the document + """ + ret = [] + for reg in self.get_AllRegions(['Text']): + ret += reg.get_TextLine() + return ret + def set_orientation(self, orientation): """ Set deskewing angle to given number. diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 44899076b3..28bce65a2b 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -113,6 +113,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(BorderType|RegionType|TextLineType|WordType|GlyphType)$', 'set_Coords'), _add_method(r'^(PageType)$', 'set_Border'), _add_method(r'^(CoordsType)$', 'set_points'), + _add_method(r'^(PageType)$', 'get_AllTextLine'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), ) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllTextLine.py b/ocrd_models/ocrd_page_user_methods/get_AllTextLine.py new file mode 100644 index 0000000000..21cdd4a5a1 --- /dev/null +++ 
b/ocrd_models/ocrd_page_user_methods/get_AllTextLine.py @@ -0,0 +1,9 @@ +def get_AllTextLine(self): + """ + Return all the TextLine in the document + """ + ret = [] + for reg in self.get_AllRegions(['Text']): + ret += reg.get_TextLine() + return ret + From bf47a073813bbf2b432ca025b99b98187478a4cf Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 30 Dec 2020 18:45:38 +0100 Subject: [PATCH 31/70] update assets --- repo/assets | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/assets b/repo/assets index 32fde9eb24..04108234fc 160000 --- a/repo/assets +++ b/repo/assets @@ -1 +1 @@ -Subproject commit 32fde9eb242c595a1986a193090c689f52eeb734 +Subproject commit 04108234fcd2bcc615727726959cb688da8ae859 From db36dd382a4e7f63e94c380e57886df7117b5e8f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 30 Dec 2020 18:46:03 +0100 Subject: [PATCH 32/70] kraken resources --- ocrd/ocrd/resource_list.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index 053ec12880..5606d18fb7 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -114,3 +114,13 @@ ocrd-anybaseocr-tiseg: name: seg_model.hdf5 description: text image segmentation model for anybaseocr size: 66080688 +ocrd-kraken-segment: + - url: https://github.com/mittagessen/kraken/raw/master/kraken/blla.mlmodel + description: Pretrained baseline segmentation model + name: blla.mlmodel + size: 5046835 +ocrd-kraken-recognize: + - url: https://zenodo.org/record/2577813/files/en_best.mlmodel?download=1 + name: 10.5281/zenodo.2577813 + description: This model has been trained on a large corpus of modern printed English text\naugmented with ~10000 lines of historical pages + size: 2930723 From a571b82f5d674e630fcff46c1eb8a9a192fa071d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 30 Dec 2020 18:48:36 +0100 Subject: [PATCH 33/70] :package: pre-release 2.22.0b2 --- CHANGELOG.md | 7 +++++++ 
ocrd_utils/setup.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0cc5f7086..22c7928332 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.22.0b2] - 2020-12-30 + +Added: + + * PAGE API method `get_AllTextLine` + * resources for kraken + ## [2.22.0b1] - 2020-12-28 Fixed: diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index f44d8a03c1..955eddb7d1 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='2.22.0b1', + version='2.22.0b2', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown', From 3aa60a80cd6a14803d6acba15adf85be879bb61c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 30 Dec 2020 19:17:41 +0100 Subject: [PATCH 34/70] reslist: use name w/o slash --- ocrd/ocrd/resource_list.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index 5606d18fb7..31af8647e3 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -121,6 +121,6 @@ ocrd-kraken-segment: size: 5046835 ocrd-kraken-recognize: - url: https://zenodo.org/record/2577813/files/en_best.mlmodel?download=1 - name: 10.5281/zenodo.2577813 + name: en_best.mlmodel description: This model has been trained on a large corpus of modern printed English text\naugmented with ~10000 lines of historical pages size: 2930723 From 4bf12fb7eaa39fc2a73874df3b69c4b6b24b5965 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Dec 2020 16:39:53 +0100 Subject: [PATCH 35/70] :package: pre-release v2.22.0b3 --- CHANGELOG.md | 5 +++++ ocrd/ocrd/workspace.py | 6 +++--- ocrd_utils/setup.py | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
22c7928332..ed2b4855d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.22.0b3] - 2020-12-30 + +Fixed: + * `name` of resources mustn't contain slash `/` + ## [2.22.0b2] - 2020-12-30 Added: diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index 61baa21ec7..ede1e5aa7f 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -2,7 +2,7 @@ from os import makedirs, unlink, listdir, path from pathlib import Path -import cv2 +from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor from PIL import Image import numpy as np from deprecated.sphinx import deprecated @@ -340,9 +340,9 @@ def _resolve_image_as_pil(self, image_url, coords=None): # FIXME: remove or replace this by (image_from_polygon+) crop_image ... log.debug("Converting PIL to OpenCV: %s", image_url) - color_conversion = cv2.COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else cv2.COLOR_RGB2BGR + color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image) - cv2_image = cv2.cvtColor(pil_as_np_array, color_conversion) + cv2_image = cvtColor(pil_as_np_array, color_conversion) poly = np.array(coords, np.int32) log.debug("Cutting region %s from %s", coords, image_url) diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index 955eddb7d1..838a7c7952 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='2.22.0b2', + version='2.22.0b3', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown', From 2c26eb09bf9bff8ce74a9f44248b01ca5d6e9a73 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 4 Jan 2021 17:36:56 +0100 Subject: [PATCH 36/70] Update ocrd/ocrd/processor/base.py obsolete import Co-authored-by: Robert 
Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd/ocrd/processor/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index dc495494c9..dbde1dba6a 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -16,7 +16,6 @@ import os import re import sys -from pkg_resources import resource_filename import requests From 02e641536907e27b118a29200ce33f6bb1a43e6a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 4 Jan 2021 18:01:14 +0100 Subject: [PATCH 37/70] rename PAGE method get_AllTextLine{,s} --- CHANGELOG.md | 2 +- ocrd_models/ocrd_models/ocrd_page_generateds.py | 4 ++-- ocrd_models/ocrd_page_user_methods.py | 2 +- .../{get_AllTextLine.py => get_AllTextLines.py} | 2 +- tests/model/test_ocrd_page.py | 5 +++++ 5 files changed, 10 insertions(+), 5 deletions(-) rename ocrd_models/ocrd_page_user_methods/{get_AllTextLine.py => get_AllTextLines.py} (85%) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed2b4855d3..cc29600c9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ Fixed: Added: - * PAGE API method `get_AllTextLine` + * PAGE API method `get_AllTextLines` * resources for kraken ## [2.22.0b1] - 2020-12-28 diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 06aaf7d539..8fd7157968 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Wed Dec 30 18:18:58 2020 by generateDS.py version 2.35.20. +# Generated Mon Jan 4 18:05:36 2021 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Oct 8 2020, 12:12:24) [GCC 8.4.0] # # Command line options: @@ -3087,7 +3087,7 @@ def set_Border(self, Border): """ self.invalidate_AlternativeImage(feature_selector='cropped') self.Border = Border - def get_AllTextLine(self): + def get_AllTextLines(self): """ Return all the TextLine in the document """ diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 28bce65a2b..376a76da17 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -113,7 +113,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(BorderType|RegionType|TextLineType|WordType|GlyphType)$', 'set_Coords'), _add_method(r'^(PageType)$', 'set_Border'), _add_method(r'^(CoordsType)$', 'set_points'), - _add_method(r'^(PageType)$', 'get_AllTextLine'), + _add_method(r'^(PageType)$', 'get_AllTextLines'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), ) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllTextLine.py b/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py similarity index 85% rename from ocrd_models/ocrd_page_user_methods/get_AllTextLine.py rename to ocrd_models/ocrd_page_user_methods/get_AllTextLines.py index 21cdd4a5a1..594e8d789f 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllTextLine.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py @@ -1,4 +1,4 @@ -def get_AllTextLine(self): +def get_AllTextLines(self): """ Return all the TextLine in the document """ diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 470a86988f..07875106ed 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -266,6 
+266,11 @@ def test_extend_AllIndexed_no_validation(self): rrs = og.get_RegionRefIndexed() self.assertEqual([x.index for x in rrs][-3:], [22, 23, 24]) + def test_get_AllTextLine(self): + with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f: + page = parseString(f.read().encode('utf8'), silence=True).get_Page() + assert len(page.get_AllTextLines()) == 55 + def test_extend_AllIndexed_validate_continuity(self): with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f: og = parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup() From 4def1d974e9645753db924b5ca0c5ed1e7682ed5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 4 Jan 2021 18:29:54 +0100 Subject: [PATCH 38/70] OcrdPage.get_AllTextLines: support region_order, stub for textline_order --- ocrd_models/ocrd_models/ocrd_page_generateds.py | 16 +++++++++++++--- .../ocrd_page_user_methods/get_AllTextLines.py | 14 ++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 8fd7157968..c8ef5b2885 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Mon Jan 4 18:05:36 2021 by generateDS.py version 2.35.20. +# Generated Mon Jan 4 18:29:27 2021 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Oct 8 2020, 12:12:24) [GCC 8.4.0] # # Command line options: @@ -3087,12 +3087,22 @@ def set_Border(self, Border): """ self.invalidate_AlternativeImage(feature_selector='cropped') self.Border = Border - def get_AllTextLines(self): + def get_AllTextLines(self, region_order='document', textline_order='top-to-bottom'): """ Return all the TextLine in the document + + Arguments: + region_order ("document"|"reading-order"|"reading-order-only") Whether to + return regions sorted by document order (``document``, default) or by + reading order with regions not in the reading order at the end of the + returned list (``reading-order``) or regions not in the reading order + omitted (``reading-order-only``) + textline_order ("top-to-bottom"|"bottom-to-top"|left-to-right"|"right-to-left") + The order of text lines within a block (not currently used) """ + # TODO handle textLineOrder ret = [] - for reg in self.get_AllRegions(['Text']): + for reg in self.get_AllRegions(['Text'], order=region_order): ret += reg.get_TextLine() return ret diff --git a/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py b/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py index 594e8d789f..ad421cd19a 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py @@ -1,9 +1,19 @@ -def get_AllTextLines(self): +def get_AllTextLines(self, region_order='document', textline_order='top-to-bottom'): """ Return all the TextLine in the document + + Arguments: + region_order ("document"|"reading-order"|"reading-order-only") Whether to + return regions sorted by document order (``document``, default) or by + reading order with regions not in the reading order at the end of the + returned list (``reading-order``) or regions not in the reading order + omitted (``reading-order-only``) + textline_order ("top-to-bottom"|"bottom-to-top"|left-to-right"|"right-to-left") + The order of text lines within a block (not currently 
used) """ + # TODO handle textLineOrder ret = [] - for reg in self.get_AllRegions(['Text']): + for reg in self.get_AllRegions(['Text'], order=region_order): ret += reg.get_TextLine() return ret From 79116038b5a83d289eea3165b2e7b63699f66342 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Jan 2021 12:41:48 +0100 Subject: [PATCH 39/70] ocrd resmgr list-installed: look in fs for candidates --- ocrd/ocrd/resource_manager.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index e1060ea1b9..51ce03fb92 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -1,18 +1,19 @@ from pathlib import Path +from os.path import join +from os import environ, listdir import re -from shutil import copyfileobj, copytree -from tempfile import TemporaryFile +from shutil import copytree from tarfile import open as open_tarfile import requests from yaml import safe_load -from .constants import RESOURCE_LIST_FILENAME - from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger -from ocrd_utils.constants import HOME, XDG_CACHE_HOME -from ocrd_utils.os import list_resource_candidates, list_all_resources, pushd_popd +from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME +from ocrd_utils.os import list_all_resources, pushd_popd + +from .constants import RESOURCE_LIST_FILENAME builtin_list_filename = Path(RESOURCE_LIST_FILENAME) user_list_filename = Path(HOME, 'ocrd', 'resources.yml') @@ -55,16 +56,27 @@ def list_installed(self, executable=None): List installed resources, matching with registry by ``name`` """ ret = [] - for executable in [executable] if executable else self.database.keys(): + if executable: + all_executables = [executable] + else: + # resources we know about + all_executables = list(self.database.keys()) + # resources in the file system + parent_dirs = 
[XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME] + if 'VIRTUAL_ENV' in environ: + parent_dirs += [join(environ['VIRTUAL_ENV'], 'share')] + for parent_dir in parent_dirs: + all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')] + for this_executable in set(all_executables): reslist = [] - for res_filename in list_all_resources(executable): + for res_filename in list_all_resources(this_executable): res_name = Path(res_filename).name - resdict = [x for x in self.database[executable] if x['name'] == res_name] + resdict = [x for x in self.database.get(this_executable, []) if x['name'] == res_name] if not resdict: # TODO handle gracefully resdict = [{'name': res_name, 'url': '???', 'description': '???', 'version_range': '???'}] reslist.append(resdict[0]) - ret.append((executable, reslist)) + ret.append((this_executable, reslist)) return ret def find_resources(self, executable=None, name=None, url=None): From 54a214a01a2a883c1d7ea766729bd62087923505 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Jan 2021 12:49:06 +0100 Subject: [PATCH 40/70] resource_list.yml: typo: ocrd{,-cis}-ocropy-recognize --- ocrd/ocrd/resource_list.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index 31af8647e3..b62adc4730 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -55,7 +55,7 @@ ocrd-calamari-recognize: size: 90275264 path_in_archive: '.' 
version_range: '>= 1.0.0' -ocrd-ocropy-recognize: +ocrd-cis-ocropy-recognize: - url: https://github.com/zuphilip/ocropy-models/raw/master/en-default.pyrnn.gz name: en-default.pyrnn.gz description: Default ocropy model From e33346b4f94b9c975acb5efa8e6600f3984a8505 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Jan 2021 14:15:12 +0100 Subject: [PATCH 41/70] resmgr list-installed: create stub in user resource list for unregistered resources --- ocrd/ocrd/constants.py | 2 + ocrd/ocrd/resource_manager.py | 70 ++++++++++++++++++++++++++--------- ocrd_utils/ocrd_utils/os.py | 2 +- 3 files changed, 55 insertions(+), 19 deletions(-) diff --git a/ocrd/ocrd/constants.py b/ocrd/ocrd/constants.py index f82d2d3bdd..1d436a7fa9 100644 --- a/ocrd/ocrd/constants.py +++ b/ocrd/ocrd/constants.py @@ -11,6 +11,7 @@ 'BASHLIB_FILENAME', 'RESOURCE_LIST_FILENAME', 'BACKUP_DIR', + 'RESOURCE_USER_LIST_COMMENT', ] TMP_PREFIX = 'ocrd-core-' @@ -19,4 +20,5 @@ DEFAULT_REPOSITORY_URL = 'http://localhost:5000/' BASHLIB_FILENAME = resource_filename(__name__, 'lib.bash') RESOURCE_LIST_FILENAME = resource_filename(__name__, 'resource_list.yml') +RESOURCE_USER_LIST_COMMENT = "# OCR-D private resource list (consider sending a PR with your own resources to OCR-D/core)" BACKUP_DIR = '.backup' diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 51ce03fb92..8ef0df6097 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -3,20 +3,18 @@ from os import environ, listdir import re from shutil import copytree +from datetime import datetime from tarfile import open as open_tarfile import requests -from yaml import safe_load +from yaml import safe_load, safe_dump from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME from ocrd_utils.os import list_all_resources, pushd_popd -from .constants import RESOURCE_LIST_FILENAME - 
-builtin_list_filename = Path(RESOURCE_LIST_FILENAME) -user_list_filename = Path(HOME, 'ocrd', 'resources.yml') +from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT class OcrdResourceManager(): @@ -26,22 +24,31 @@ class OcrdResourceManager(): def __init__(self): self.log = getLogger('ocrd.resource_manager') self.database = {} - self.load_resource_list(builtin_list_filename) - self.load_resource_list(user_list_filename) + self.load_resource_list(Path(RESOURCE_LIST_FILENAME)) + self.user_list = Path(XDG_CONFIG_HOME, 'ocrd', 'resources.yml') + if not self.user_list.exists(): + if not self.user_list.parent.exists(): + self.user_list.parent.mkdir() + with open(str(self.user_list), 'w', encoding='utf-8') as f: + f.write(RESOURCE_USER_LIST_COMMENT) + self.load_resource_list(self.user_list) - def load_resource_list(self, list_filename): + def load_resource_list(self, list_filename, database=None): + if not database: + database = self.database if list_filename.is_file(): with open(list_filename, 'r', encoding='utf-8') as f: - list_loaded = safe_load(f) + list_loaded = safe_load(f) or {} report = OcrdResourceListValidator.validate(list_loaded) if not report.is_valid: self.log.error('\n'.join(report.errors)) raise ValueError("Resource list %s is invalid!" 
% (list_filename)) for executable, resource_list in list_loaded.items(): - if executable not in self.database: - self.database[executable] = [] + if executable not in database: + database[executable] = [] # Prepend, so user provided is sorted before builtin - self.database[executable] = list_loaded[executable] + self.database[executable] + database[executable] = list_loaded[executable] + database[executable] + return database def list_available(self, executable=None): """ @@ -73,21 +80,48 @@ def list_installed(self, executable=None): res_name = Path(res_filename).name resdict = [x for x in self.database.get(this_executable, []) if x['name'] == res_name] if not resdict: - # TODO handle gracefully - resdict = [{'name': res_name, 'url': '???', 'description': '???', 'version_range': '???'}] + self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'" % (this_executable, res_name, res_filename, self.user_list)) + resdict = [self.add_to_user_database(this_executable, res_filename)] reslist.append(resdict[0]) ret.append((this_executable, reslist)) return ret - def find_resources(self, executable=None, name=None, url=None): + def add_to_user_database(self, executable, res_filename): + """ + Add a stub entry to the user resource.yml + """ + res_name = Path(res_filename).name + res_size = Path(res_filename).stat().st_size + with open(self.user_list, 'r', encoding='utf-8') as f: + user_database = safe_load(f) or {} + if executable not in user_database: + user_database[executable] = [] + if not self.find_resources(executable=executable, name=res_name, database=user_database): + resdict = { + 'name': res_name, + 'url': '???', + 'description': 'Found at %s on %s' % (res_filename, datetime.now()), + 'version_range': '???', + 'size': res_size + } + user_database[executable].append(resdict) + with open(self.user_list, 'w', encoding='utf-8') as f: + f.write(RESOURCE_USER_LIST_COMMENT) + f.write('\n') + f.write(safe_dump(user_database)) + return resdict + + def 
find_resources(self, executable=None, name=None, url=None, database=None): """ Find resources in the registry """ + if not database: + database = self.database ret = [] - if executable and executable not in self.database.keys(): + if executable and executable not in database.keys(): return ret - for executable in [executable] if executable else self.database.keys(): - for resdict in self.database[executable]: + for executable in [executable] if executable else database.keys(): + for resdict in database[executable]: if not name and not url: ret.append((executable, resdict)) elif url and url == resdict['url']: diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 64138dfead..d37b818d45 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -101,7 +101,7 @@ def list_all_resources(executable): for xdgdir in [join(d, executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME]]: if isdir(xdgdir): candidates += list(scandir(xdgdir)) - return candidates + return [x.path for x in candidates] # ht @pabs3 # https://github.com/untitaker/python-atomicwrites/issues/42 From 3ee66ce593bb166c2825b0e58e5034927ee06da3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Jan 2021 15:13:35 +0100 Subject: [PATCH 42/70] resmgr: use last URL segment as the resource name --- ocrd/ocrd/resource_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 8ef0df6097..d8a66712d5 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -5,6 +5,7 @@ from shutil import copytree from datetime import datetime from tarfile import open as open_tarfile +from urllib.parse import urlparse import requests from yaml import safe_load, safe_dump @@ -164,7 +165,8 @@ def download( log = getLogger('ocrd.resource_manager.download') destdir = Path(basedir, executable) if not name: - name = re.sub('[^A-Za-z0-9]', '', url) + url_parsed = 
urlparse(url) + name = Path(url_parsed.path).name fpath = Path(destdir, name) if fpath.exists() and not overwrite: log.info("%s to be downloaded to %s which already exists and overwrite is False" % (url, fpath)) From b21d46286e33dbe74ca88dd272d21787e723c31d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Jan 2021 15:25:51 +0100 Subject: [PATCH 43/70] resmgr: unquote URL encoded path --- ocrd/ocrd/resource_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index d8a66712d5..adf79091ca 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -5,7 +5,7 @@ from shutil import copytree from datetime import datetime from tarfile import open as open_tarfile -from urllib.parse import urlparse +from urllib.parse import urlparse, unquote import requests from yaml import safe_load, safe_dump @@ -166,7 +166,7 @@ def download( destdir = Path(basedir, executable) if not name: url_parsed = urlparse(url) - name = Path(url_parsed.path).name + name = Path(unqote(url_parsed.path)).name fpath = Path(destdir, name) if fpath.exists() and not overwrite: log.info("%s to be downloaded to %s which already exists and overwrite is False" % (url, fpath)) From d8d97af1733b1f9b8bb525c779f7f1bee86cbefa Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 6 Jan 2021 13:38:39 +0100 Subject: [PATCH 44/70] resmgr: use GET instead of HEAD for content-length --- ocrd/ocrd/cli/resmgr.py | 4 ++-- ocrd/ocrd/resource_manager.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index bc82e14f6d..35ba0c5ff9 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -81,9 +81,9 @@ def download(any_url, overwrite, location, executable, url_or_name): log.info("No resources found in registry") if is_url and any_url: log.info("Downloading unregistered resource %s" % url_or_name) - with 
requests.head(url_or_name) as r: + with requests.get(url_or_name, stream=True) as r: content_length = int(r.headers.get('content-length')) - with click.progressbar(length=content_length) as bar: + with click.progressbar(length=content_length, label="Downloading") as bar: fpath = resmgr.download( executable, url_or_name, diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index adf79091ca..9d6ed038fd 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -166,7 +166,7 @@ def download( destdir = Path(basedir, executable) if not name: url_parsed = urlparse(url) - name = Path(unqote(url_parsed.path)).name + name = Path(unquote(url_parsed.path)).name fpath = Path(destdir, name) if fpath.exists() and not overwrite: log.info("%s to be downloaded to %s which already exists and overwrite is False" % (url, fpath)) From 509200c89ea2437ee820e80a34adc5ab5dcbc8ff Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 7 Jan 2021 18:58:51 +0100 Subject: [PATCH 45/70] resmgr: support "download" (=copying) of local files --- ocrd/ocrd/cli/resmgr.py | 24 ++++++++++++++---------- ocrd/ocrd/resource_manager.py | 28 ++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 35ba0c5ff9..480f468cb5 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -51,7 +51,7 @@ def list_installed(executable=None): print_resources(executable, reslist) @resmgr_cli.command('download') -@click.option('-n', '--any-url', help='Allow downloading unregistered resources', is_flag=True) +@click.option('-n', '--any-url', help='Allow downloading/copying unregistered resources', is_flag=True) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) @click.option('-l', '--location', help='Where to store resources', type=click.Choice(['cache', 'config', 'data', 'cwd']), default='cache', show_default=True) 
@click.argument('executable', required=True) @@ -64,7 +64,7 @@ def download(any_url, overwrite, location, executable, url_or_name): If URL_OR_NAME is '*' (asterisk), download all known resources for this processor - If ``--any-url`` is given, also accepts URL of non-registered resources for ``URL_OR_NAME``. + If ``--any-url`` is given, also accepts URL or filenames of non-registered resources for ``URL_OR_NAME``. """ log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() @@ -73,25 +73,29 @@ def download(any_url, overwrite, location, executable, url_or_name): XDG_CONFIG_HOME if location == 'config' else \ getcwd() is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') + is_filename = Path(url_or_name).exists() find_kwargs = {'executable': executable} if url_or_name != '*': find_kwargs['url' if is_url else 'name'] = url_or_name reslist = resmgr.find_resources(**find_kwargs) if not reslist: log.info("No resources found in registry") - if is_url and any_url: - log.info("Downloading unregistered resource %s" % url_or_name) - with requests.get(url_or_name, stream=True) as r: - content_length = int(r.headers.get('content-length')) - with click.progressbar(length=content_length, label="Downloading") as bar: + if any_url and (is_url or is_filename): + log.info("%s unregistered resource %s" % ("Downloading" if is_url else "Copying", url_or_name)) + if is_url: + with requests.get(url_or_name, stream=True) as r: + content_length = int(r.headers.get('content-length')) + else: + url_or_name = str(Path(url_or_name).resolve()) + content_length = Path(url_or_name).stat().st_size + with click.progressbar(length=content_length, label="Downloading" if is_url else "Copying") as bar: fpath = resmgr.download( executable, url_or_name, overwrite=overwrite, basedir=basedir, - progress_cb=lambda delta: bar.update(delta) - ) - log.info("Downloaded %s to %s" % (url_or_name, fpath)) + progress_cb=lambda delta: bar.update(delta)) + log.info("%s %s to %s" % 
("Downloaded" if is_url else "Copied", url_or_name, fpath)) log.info("Use in parameters as '%s'" % fpath.name) else: sys.exit(1) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 9d6ed038fd..4a3b5aea8b 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -138,15 +138,29 @@ def parameter_usage(self, name, usage='as-is'): return Path(name).stem def _download_impl(self, url, filename, progress_cb=None): + log = getLogger('ocrd.resource_manager._download_impl') + log.info("Downloading %s" % url) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: total = int(r.headers.get('content-length')) - # copyfileobj(r.raw, f_write_tar) for data in r.iter_content(chunk_size=4096): if progress_cb: progress_cb(len(data)) f.write(data) + def _copy_impl(self, src_filename, filename, progress_cb=None): + log = getLogger('ocrd.resource_manager._copy_impl') + log.info("Copying %s" % src_filename) + with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in: + while True: + chunk = f_in.read(4096) + if chunk: + f_out.write(chunk) + if progress_cb: + progress_cb(len(chunk)) + else: + break + # TODO Proper caching (make head request for size, If-Modified etc) def download( self, @@ -168,16 +182,22 @@ def download( url_parsed = urlparse(url) name = Path(unquote(url_parsed.path)).name fpath = Path(destdir, name) + is_url = url.startswith('https://') or url.startswith('http://') if fpath.exists() and not overwrite: log.info("%s to be downloaded to %s which already exists and overwrite is False" % (url, fpath)) return fpath destdir.mkdir(parents=True, exist_ok=True) if resource_type == 'file': - self._download_impl(url, fpath, progress_cb) + if is_url: + self._download_impl(url, fpath, progress_cb) + else: + self._copy_impl(url, fpath, progress_cb) elif resource_type == 'tarball': with pushd_popd(tempdir=True): - log.info("Downloading %s" % url) - self._download_impl(url, 'download.tar.xx', progress_cb) 
+ if is_url: + self._download_impl(url, 'download.tar.xx', progress_cb) + else: + self._copy_impl(url, 'download.tar.xx', progress_cb) Path('out').mkdir() with pushd_popd('out'): log.info("Extracting tarball") From cbbc09a81180afbead2bd35822661cac2f5cd075 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 12 Jan 2021 13:18:39 +0100 Subject: [PATCH 46/70] resmgr, introduce intermediary "ocrd-resource" dir --- ocrd/ocrd/cli/resmgr.py | 7 ++++--- ocrd/ocrd/resource_manager.py | 9 +++++---- ocrd_utils/ocrd_utils/os.py | 12 ++++++------ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 480f468cb5..1ebf546aa5 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -1,5 +1,6 @@ import sys from os import getcwd +from os.path import join from pathlib import Path import requests @@ -68,9 +69,9 @@ def download(any_url, overwrite, location, executable, url_or_name): """ log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() - basedir = XDG_CACHE_HOME if location == 'cache' else \ - XDG_DATA_HOME if location == 'data' else \ - XDG_CONFIG_HOME if location == 'config' else \ + basedir = join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ + join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ + join(XDG_CONFIG_HOME, 'ocrd-resources') if location == 'config' else \ getcwd() is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') is_filename = Path(url_or_name).exists() diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 4a3b5aea8b..43dc842390 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -70,11 +70,12 @@ def list_installed(self, executable=None): # resources we know about all_executables = list(self.database.keys()) # resources in the file system - parent_dirs = [XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME] + parent_dirs = [join(x, 'ocrd-resources') for x 
in [XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME]] if 'VIRTUAL_ENV' in environ: - parent_dirs += [join(environ['VIRTUAL_ENV'], 'share')] + parent_dirs += [join(environ['VIRTUAL_ENV'], 'share', 'ocrd-resources')] for parent_dir in parent_dirs: - all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')] + if Path(parent_dir).exists(): + all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')] for this_executable in set(all_executables): reslist = [] for res_filename in list_all_resources(this_executable): @@ -184,7 +185,7 @@ def download( fpath = Path(destdir, name) is_url = url.startswith('https://') or url.startswith('http://') if fpath.exists() and not overwrite: - log.info("%s to be downloaded to %s which already exists and overwrite is False" % (url, fpath)) + log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath)) return fpath destdir.mkdir(parents=True, exist_ok=True) if resource_type == 'file': diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index d37b818d45..ff44b489e5 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -65,7 +65,7 @@ def list_resource_candidates(executable, fname, cwd=getcwd(), is_file=False, is_ https://ocr-d.de/en/spec/ocrd_tool#file-parameters (except python-bundled) """ candidates = [] - candidates.append(join(cwd, fname)) + candidates.append(join(cwd, 'ocrd-resources', fname)) processor_path_var = '%s_PATH' % executable.replace('-', '_').upper() if processor_path_var in environ: candidates += [join(x, fname) for x in environ[processor_path_var].split(':')] @@ -86,19 +86,19 @@ def list_all_resources(executable): https://ocr-d.de/en/spec/ocrd_tool#file-parameters (except python-bundled) """ candidates = [] - # XXX this will produce too many false positives - # for root, dirs, files in walk(cwd): - # candidates += files + cwd_candidate = join(getcwd(), 'ocrd-resources', 
executable) + if Path(cwd_candidate).exists(): + candidates.append(cwd_candidate) processor_path_var = '%s_PATH' % executable.replace('-', '_').upper() if processor_path_var in environ: for processor_path in environ[processor_path_var].split(':'): if isdir(processor_path): candidates += list(scandir(processor_path)) if 'VIRTUAL_ENV' in environ: - sharedir = join(environ['VIRTUAL_ENV'], 'share', executable) + sharedir = join(environ['VIRTUAL_ENV'], 'share', 'ocrd-resources', executable) if isdir(sharedir): candidates += list(scandir(sharedir)) - for xdgdir in [join(d, executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME]]: + for xdgdir in [join(d, 'ocrd-resources', executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME]]: if isdir(xdgdir): candidates += list(scandir(xdgdir)) return [x.path for x in candidates] From 7b1b6c9a6348faf279adcd5f3bdd9ce924e86699 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 12 Jan 2021 15:31:28 +0100 Subject: [PATCH 47/70] default to VIRTUAL_ENV sharedir --- ocrd/ocrd/cli/resmgr.py | 6 ++++-- ocrd/ocrd/resource_manager.py | 6 +++--- ocrd_utils/ocrd_utils/__init__.py | 1 + ocrd_utils/ocrd_utils/constants.py | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 1ebf546aa5..367c5afbf7 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -9,6 +9,7 @@ from ocrd_utils import ( initLogging, getLogger, + VIRTUAL_ENV, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME @@ -54,7 +55,7 @@ def list_installed(executable=None): @resmgr_cli.command('download') @click.option('-n', '--any-url', help='Allow downloading/copying unregistered resources', is_flag=True) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) -@click.option('-l', '--location', help='Where to store resources', type=click.Choice(['cache', 'config', 'data', 'cwd']), default='cache', show_default=True) +@click.option('-l', 
'--location', help='Where to store resources', type=click.Choice(['virtualenv', 'cache', 'config', 'data', 'cwd']), default='virtualenv', show_default=True) @click.argument('executable', required=True) @click.argument('url_or_name', required=True) def download(any_url, overwrite, location, executable, url_or_name): @@ -69,7 +70,8 @@ def download(any_url, overwrite, location, executable, url_or_name): """ log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() - basedir = join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ + basedir = join(VIRTUAL_ENV, 'ocrd-resources') if location == 'virtualenv' and VIRTUAL_ENV else \ + join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ join(XDG_CONFIG_HOME, 'ocrd-resources') if location == 'config' else \ getcwd() diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 43dc842390..0f016c6882 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -12,7 +12,7 @@ from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger -from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME +from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME, VIRTUAL_ENV from ocrd_utils.os import list_all_resources, pushd_popd from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT @@ -71,8 +71,8 @@ def list_installed(self, executable=None): all_executables = list(self.database.keys()) # resources in the file system parent_dirs = [join(x, 'ocrd-resources') for x in [XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME]] - if 'VIRTUAL_ENV' in environ: - parent_dirs += [join(environ['VIRTUAL_ENV'], 'share', 'ocrd-resources')] + if VIRTUAL_ENV: + parent_dirs += [join(VIRTUAL_ENV, 'share', 'ocrd-resources')] for parent_dir in parent_dirs: if Path(parent_dir).exists(): all_executables += [x for x in 
listdir(parent_dir) if x.startswith('ocrd-')] diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index f574ab76af..030059bec3 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -79,6 +79,7 @@ LOG_FORMAT, LOG_TIMEFMT, VERSION, + VIRTUAL_ENV, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME) diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index 38e050de91..262d95974a 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -17,6 +17,7 @@ 'REGEX_PREFIX', 'REGEX_FILE_ID', 'VERSION', + 'VIRTUAL_ENV', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME', 'XDG_CACHE_HOME', @@ -105,3 +106,4 @@ XDG_DATA_HOME = environ['XDG_DATA_HOME'] if 'XDG_DATA_HOME' in environ else join(HOME, '.local', 'share') XDG_CONFIG_HOME = environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in environ else join(HOME, '.config') XDG_CACHE_HOME = environ['XDG_CACHE_HOME'] if 'XDG_CACHE_HOME' in environ else join(HOME, '.cache') +VIRTUAL_ENV = environ.get('VIRTUAL_ENV', None) From 565ba38e52ea8cee18675605b009beba98d44725 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 12 Jan 2021 15:50:32 +0100 Subject: [PATCH 48/70] resmgr: save stub on download --- ocrd/ocrd/cli/resmgr.py | 2 ++ ocrd/ocrd/resource_manager.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 367c5afbf7..18e5142cec 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -98,6 +98,8 @@ def download(any_url, overwrite, location, executable, url_or_name): overwrite=overwrite, basedir=basedir, progress_cb=lambda delta: bar.update(delta)) + log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'" % (executable, fpath.name, url_or_name, resmgr.user_list)) + resmgr.add_to_user_database(executable, fpath, url_or_name) log.info("%s %s to %s" % ("Downloaded" if is_url else "Copied", url_or_name, fpath)) 
log.info("Use in parameters as '%s'" % fpath.name) else: diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 0f016c6882..23269ea11e 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -88,7 +88,7 @@ def list_installed(self, executable=None): ret.append((this_executable, reslist)) return ret - def add_to_user_database(self, executable, res_filename): + def add_to_user_database(self, executable, res_filename, url=None): """ Add a stub entry to the user resource.yml """ @@ -101,7 +101,7 @@ def add_to_user_database(self, executable, res_filename): if not self.find_resources(executable=executable, name=res_name, database=user_database): resdict = { 'name': res_name, - 'url': '???', + 'url': url if url else '???', 'description': 'Found at %s on %s' % (res_filename, datetime.now()), 'version_range': '???', 'size': res_size From 012e49ea0726e73da1656ffb4591069ed002f3b1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 12 Jan 2021 18:49:40 +0100 Subject: [PATCH 49/70] get_AllTextLines: implement textlineOrder --- ocrd_models/ocrd_models/ocrd_page_generateds.py | 16 ++++++++++------ .../ocrd_page_user_methods/get_AllTextLines.py | 14 +++++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index c8ef5b2885..e6cd8b9495 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Mon Jan 4 18:29:27 2021 by generateDS.py version 2.35.20. +# Generated Tue Jan 12 18:51:26 2021 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Oct 8 2020, 12:12:24) [GCC 8.4.0] # # Command line options: @@ -3087,7 +3087,7 @@ def set_Border(self, Border): """ self.invalidate_AlternativeImage(feature_selector='cropped') self.Border = Border - def get_AllTextLines(self, region_order='document', textline_order='top-to-bottom'): + def get_AllTextLines(self, region_order='document', respect_textline_order=True): """ Return all the TextLine in the document @@ -3097,13 +3097,17 @@ def get_AllTextLines(self, region_order='document', textline_order='top-to-botto reading order with regions not in the reading order at the end of the returned list (``reading-order``) or regions not in the reading order omitted (``reading-order-only``) - textline_order ("top-to-bottom"|"bottom-to-top"|left-to-right"|"right-to-left") - The order of text lines within a block (not currently used) + respect_textline_order (boolean) Whether to respect textlineOrder attribute """ - # TODO handle textLineOrder + # TODO handle textLineOrder according to https://github.com/PRImA-Research-Lab/PAGE-XML/issues/26 ret = [] for reg in self.get_AllRegions(['Text'], order=region_order): - ret += reg.get_TextLine() + lines = reg.get_TextLine() + if not respect_textline_order: + ret += lines + else: + lo = reg.get_textLineOrder() or self.get_textLineOrder() or 'top-to-bottom' + ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines)) return ret def set_orientation(self, orientation): diff --git a/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py b/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py index ad421cd19a..ce31a89d43 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllTextLines.py @@ -1,4 +1,4 @@ -def get_AllTextLines(self, region_order='document', textline_order='top-to-bottom'): +def get_AllTextLines(self, region_order='document', respect_textline_order=True): """ Return all the TextLine in the document @@ -8,12 +8,16 
@@ def get_AllTextLines(self, region_order='document', textline_order='top-to-botto reading order with regions not in the reading order at the end of the returned list (``reading-order``) or regions not in the reading order omitted (``reading-order-only``) - textline_order ("top-to-bottom"|"bottom-to-top"|left-to-right"|"right-to-left") - The order of text lines within a block (not currently used) + respect_textline_order (boolean) Whether to respect textlineOrder attribute """ - # TODO handle textLineOrder + # TODO handle textLineOrder according to https://github.com/PRImA-Research-Lab/PAGE-XML/issues/26 ret = [] for reg in self.get_AllRegions(['Text'], order=region_order): - ret += reg.get_TextLine() + lines = reg.get_TextLine() + if not respect_textline_order: + ret += lines + else: + lo = reg.get_textLineOrder() or self.get_textLineOrder() or 'top-to-bottom' + ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines)) return ret From 199b430e0201c2623736926e166b19537202429f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 12 Jan 2021 19:02:31 +0100 Subject: [PATCH 50/70] resmgr: ocrd-resources also for list_resource_candidates --- ocrd_utils/ocrd_utils/os.py | 8 ++++---- tests/utils/test_os.py | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index ff44b489e5..75ce5c6875 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -70,10 +70,10 @@ def list_resource_candidates(executable, fname, cwd=getcwd(), is_file=False, is_ if processor_path_var in environ: candidates += [join(x, fname) for x in environ[processor_path_var].split(':')] if 'VIRTUAL_ENV' in environ: - candidates.append(join(environ['VIRTUAL_ENV'], 'share', executable, fname)) - candidates.append(join(XDG_DATA_HOME, executable, fname)) - candidates.append(join(XDG_CONFIG_HOME, executable, fname)) - candidates.append(join(XDG_CACHE_HOME, executable, fname)) 
+ candidates.append(join(environ['VIRTUAL_ENV'], 'share', 'ocrd-resources', executable, fname)) + candidates.append(join(XDG_DATA_HOME, 'ocrd-resources', executable, fname)) + candidates.append(join(XDG_CONFIG_HOME, 'ocrd-resources', executable, fname)) + candidates.append(join(XDG_CACHE_HOME, 'ocrd-resources', executable, fname)) if is_file: candidates = [c for c in candidates if Path(c).is_file()] if is_dir: diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index f2cd6efec9..ca33240413 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -11,6 +11,7 @@ class TestOsUtils(TestCase): def setUp(self): + self.maxDiff = None self.tempdir_path = mkdtemp() self.tempdir_venv = mkdtemp() ENV['OCRD_DUMMY_PATH'] = self.tempdir_path @@ -34,12 +35,12 @@ def dehomify(s): cands = [dehomify(x) for x in cands] print(cands) self.assertEqual(cands, [join(x, fname) for x in [ - dehomify(getcwd()), + dehomify(join(getcwd(), 'ocrd-resources')), dehomify(self.tempdir_path), - dehomify(join(self.tempdir_venv, 'share', 'ocrd-dummy')), - '$HOME/.local/share/ocrd-dummy', - '$HOME/.config/ocrd-dummy', - '$HOME/.cache/ocrd-dummy', + dehomify(join(self.tempdir_venv, 'share', 'ocrd-resources', 'ocrd-dummy')), + '$HOME/.local/share/ocrd-resources/ocrd-dummy', + '$HOME/.config/ocrd-resources/ocrd-dummy', + '$HOME/.cache/ocrd-resources/ocrd-dummy', ]]) From 83498075ed2a0798586ab9427e58c37c9bde6b2a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 14 Jan 2021 13:01:13 +0100 Subject: [PATCH 51/70] resmgr: add @stweil's ONB model to list --- ocrd/ocrd/resource_list.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd/ocrd/resource_list.yml b/ocrd/ocrd/resource_list.yml index b62adc4730..3734e1dc89 100644 --- a/ocrd/ocrd/resource_list.yml +++ b/ocrd/ocrd/resource_list.yml @@ -5,6 +5,11 @@ ocrd-tesserocr-recognize: parameter_usage: 'without-extension' description: Tesseract LSTM model trained on GT4HistOCR size: 1058487 + - url: 
https://ub-backup.bib.uni-mannheim.de/~stweil/ocrd-train/data/ONB/tessdata_best/ONB_1.195_300718_989100.traineddata + name: ONB.traineddata + parameter_usage: 'without-extension' + description: Tesseract LSTM model based on Austrian National Library newspaper data + size: 4358948 - url: https://github.com/tesseract-ocr/tessdata_fast/raw/master/equ.traineddata name: equ.traineddata parameter_usage: 'without-extension' From 7840b5b54f7713e5beb062a169eeaa087ba5c076 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 18 Jan 2021 13:09:04 +0100 Subject: [PATCH 52/70] resmgr: when wildcard downloading, omit ??? user entries --- ocrd/ocrd/cli/resmgr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 18e5142cec..2250b7189a 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -106,6 +106,9 @@ def download(any_url, overwrite, location, executable, url_or_name): sys.exit(1) else: for _, resdict in reslist: + if resdict['url'] == '???': + log.info("Cannot download user resource %s" % (resdict['name'])), + continue log.info("Downloading resource %s" % resdict) with click.progressbar(length=resdict['size']) as bar: fpath = resmgr.download( From 503800514a310fba9eaa9a99a4ca052f35de6f01 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 19 Jan 2021 17:17:33 +0100 Subject: [PATCH 53/70] add a config file $XDG_CONFIG_HOME/ocrd.yml --- ocrd/ocrd/cli/resmgr.py | 3 ++- ocrd/ocrd/config.py | 25 +++++++++++++++++++ .../ocrd_modelfactory/__init__.py | 3 ++- ocrd_models/ocrd_models/__init__.py | 1 + ocrd_models/ocrd_models/ocrd_config.py | 25 +++++++++++++++++++ ocrd_utils/ocrd_utils/__init__.py | 1 + ocrd_utils/ocrd_utils/constants.py | 3 +++ ocrd_validators/ocrd_validators/__init__.py | 2 ++ ocrd_validators/ocrd_validators/constants.py | 1 + .../ocrd_validators/ocrd_config.schema.yml | 7 ++++++ .../ocrd_validators/ocrd_config_validator.py | 22 ++++++++++++++++ tests/test_ocrd_config.py | 18 
+++++++++++++ 12 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 ocrd/ocrd/config.py create mode 100644 ocrd_models/ocrd_models/ocrd_config.py create mode 100644 ocrd_validators/ocrd_validators/ocrd_config.schema.yml create mode 100644 ocrd_validators/ocrd_validators/ocrd_config_validator.py create mode 100644 tests/test_ocrd_config.py diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 2250b7189a..de46729ba2 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -10,6 +10,7 @@ initLogging, getLogger, VIRTUAL_ENV, + RESOURCE_LOCATIONS, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME @@ -55,7 +56,7 @@ def list_installed(executable=None): @resmgr_cli.command('download') @click.option('-n', '--any-url', help='Allow downloading/copying unregistered resources', is_flag=True) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) -@click.option('-l', '--location', help='Where to store resources', type=click.Choice(['virtualenv', 'cache', 'config', 'data', 'cwd']), default='virtualenv', show_default=True) +@click.option('-l', '--location', help='Where to store resources', type=click.Choice(RESOURCE_LOCATIONS), default='virtualenv', show_default=True) @click.argument('executable', required=True) @click.argument('url_or_name', required=True) def download(any_url, overwrite, location, executable, url_or_name): diff --git a/ocrd/ocrd/config.py b/ocrd/ocrd/config.py new file mode 100644 index 0000000000..86ef5f5d48 --- /dev/null +++ b/ocrd/ocrd/config.py @@ -0,0 +1,25 @@ +from pathlib import Path +from datetime import datetime + +from yaml import safe_load, safe_dump + +from ocrd_models import OcrdConfig +from ocrd_utils import XDG_CONFIG_HOME, VERSION +from ocrd_validators import OcrdConfigValidator +from ocrd_models.ocrd_config import DEFAULT_CONFIG + +def load_config_file(): + """ + Load the configuration file + """ + fpath = Path(XDG_CONFIG_HOME, 'ocrd.yml') + if not fpath.exists(): + with 
open(str(fpath), 'w', encoding='utf-8') as f_out: + f_out.write("# Generated by OCR-D/core %s on %s\n" % (VERSION, datetime.now())) + f_out.write(safe_dump(DEFAULT_CONFIG)) + with open(str(fpath), 'r', encoding='utf-8') as f_in: + obj = safe_load(f_in.read()) + report = OcrdConfigValidator.validate(obj) + if not report.is_valid: + raise ValueError("The configuration is invalid: %s" % report.errors) + return OcrdConfig(obj) diff --git a/ocrd_modelfactory/ocrd_modelfactory/__init__.py b/ocrd_modelfactory/ocrd_modelfactory/__init__.py index 8a27781f6a..3e802a6810 100644 --- a/ocrd_modelfactory/ocrd_modelfactory/__init__.py +++ b/ocrd_modelfactory/ocrd_modelfactory/__init__.py @@ -5,11 +5,12 @@ """ from datetime import datetime from pathlib import Path +from yaml import safe_load, safe_dump from PIL import Image from ocrd_utils import VERSION, MIMETYPE_PAGE -from ocrd_models import OcrdExif +from ocrd_models import OcrdExif, OcrdConfig from ocrd_models.ocrd_page import PcGtsType, PageType, MetadataType, parse __all__ = [ diff --git a/ocrd_models/ocrd_models/__init__.py b/ocrd_models/ocrd_models/__init__.py index 9a31a2d4c7..c29414690f 100644 --- a/ocrd_models/ocrd_models/__init__.py +++ b/ocrd_models/ocrd_models/__init__.py @@ -2,6 +2,7 @@ APIs and schemas for various file formats in the OCR domain. 
""" from .ocrd_agent import OcrdAgent +from .ocrd_config import OcrdConfig from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile from .ocrd_mets import OcrdMets diff --git a/ocrd_models/ocrd_models/ocrd_config.py b/ocrd_models/ocrd_models/ocrd_config.py new file mode 100644 index 0000000000..aeb7db14f3 --- /dev/null +++ b/ocrd_models/ocrd_models/ocrd_config.py @@ -0,0 +1,25 @@ +""" +Configuration file +""" +import json + +DEFAULT_CONFIG = { + 'resource_location': 'virtualenv' +} + +class OcrdConfig(): + + __slots__ = DEFAULT_CONFIG.keys() + + def __str__(self): + return 'OcrdConfig %s' % json.dumps(self.__dict__) + + def dump(self): + ret = {} + for k in DEFAULT_CONFIG.keys(): + ret[k] = getattr(self, k) + return ret + + def __init__(self, obj): + for k, v in obj.items(): + setattr(self, k, v) diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index 030059bec3..141cc6814b 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -76,6 +76,7 @@ PIL_TO_MIME, REGEX_PREFIX, REGEX_FILE_ID, + RESOURCE_LOCATIONS, LOG_FORMAT, LOG_TIMEFMT, VERSION, diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index 262d95974a..392011072c 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -16,6 +16,7 @@ 'PIL_TO_MIME', 'REGEX_PREFIX', 'REGEX_FILE_ID', + 'RESOURCE_LOCATIONS', 'VERSION', 'VIRTUAL_ENV', 'XDG_CONFIG_HOME', @@ -107,3 +108,5 @@ XDG_CONFIG_HOME = environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in environ else join(HOME, '.config') XDG_CACHE_HOME = environ['XDG_CACHE_HOME'] if 'XDG_CACHE_HOME' in environ else join(HOME, '.cache') VIRTUAL_ENV = environ.get('VIRTUAL_ENV', None) + +RESOURCE_LOCATIONS = ['virtualenv', 'cwd', 'cache', 'config', 'data'] diff --git a/ocrd_validators/ocrd_validators/__init__.py b/ocrd_validators/ocrd_validators/__init__.py index 4819017dd0..e2ce63e743 100644 --- 
a/ocrd_validators/ocrd_validators/__init__.py +++ b/ocrd_validators/ocrd_validators/__init__.py @@ -6,6 +6,7 @@ 'WorkspaceValidator', 'PageValidator', 'OcrdToolValidator', + 'OcrdConfigValidator', 'OcrdResourceListValidator', 'OcrdZipValidator', 'XsdValidator', @@ -17,6 +18,7 @@ from .workspace_validator import WorkspaceValidator from .page_validator import PageValidator from .ocrd_tool_validator import OcrdToolValidator +from .ocrd_config_validator import OcrdConfigValidator from .resource_list_validator import OcrdResourceListValidator from .ocrd_zip_validator import OcrdZipValidator from .xsd_validator import XsdValidator diff --git a/ocrd_validators/ocrd_validators/constants.py b/ocrd_validators/ocrd_validators/constants.py index 25d2e0e53b..cfb9085f5c 100644 --- a/ocrd_validators/ocrd_validators/constants.py +++ b/ocrd_validators/ocrd_validators/constants.py @@ -20,6 +20,7 @@ OCRD_TOOL_SCHEMA = yaml.safe_load(resource_string(__name__, 'ocrd_tool.schema.yml')) RESOURCE_LIST_SCHEMA = yaml.safe_load(resource_string(__name__, 'resource_list.schema.yml')) +CONFIG_SCHEMA = yaml.safe_load(resource_string(__name__, 'ocrd_config.schema.yml')) OCRD_BAGIT_PROFILE = yaml.safe_load(resource_string(__name__, 'bagit-profile.yml')) BAGIT_TXT = 'BagIt-Version: 1.0\nTag-File-Character-Encoding: UTF-8' diff --git a/ocrd_validators/ocrd_validators/ocrd_config.schema.yml b/ocrd_validators/ocrd_validators/ocrd_config.schema.yml new file mode 100644 index 0000000000..114216176a --- /dev/null +++ b/ocrd_validators/ocrd_validators/ocrd_config.schema.yml @@ -0,0 +1,7 @@ +type: object +additionalProperties: true +properties: + resource_location: + type: string + enum: ['virtualenv', 'cache', 'config', 'data', 'cwd'] + default: 'virtualenv' diff --git a/ocrd_validators/ocrd_validators/ocrd_config_validator.py b/ocrd_validators/ocrd_validators/ocrd_config_validator.py new file mode 100644 index 0000000000..40e4c1ac65 --- /dev/null +++ 
b/ocrd_validators/ocrd_validators/ocrd_config_validator.py @@ -0,0 +1,22 @@ +""" +Validating $HOME/.config/ocrd.yml +""" +from .constants import CONFIG_SCHEMA +from .json_validator import JsonValidator + +# +# ------------------------------------------------- +# + +class OcrdConfigValidator(JsonValidator): + """ + JsonValidator validating against the ``ocrd-tool.json`` schema. + """ + + @staticmethod + def validate(obj, schema=CONFIG_SCHEMA): + """ + Validate against ``ocrd_config.schema.yml`` schema. + """ + return JsonValidator.validate(obj, schema) + diff --git a/tests/test_ocrd_config.py b/tests/test_ocrd_config.py new file mode 100644 index 0000000000..8d25dbd92e --- /dev/null +++ b/tests/test_ocrd_config.py @@ -0,0 +1,18 @@ +from tests.base import main +from unittest import mock + +import ocrd_utils + +def test_config_loading(): + XDG_CONFIG_HOME_before = ocrd_utils.XDG_CONFIG_HOME + with ocrd_utils.pushd_popd(tempdir=True) as tempdir: + ocrd_utils.XDG_CONFIG_HOME = tempdir + with open('ocrd.yml', 'w', encoding='utf-8') as f: + f.write('resource_location: cache\n') + from ocrd.config import load_config_file + obj = load_config_file() + assert obj.dump() == {'resource_location': 'cache'} + ocrd_utils.XDG_CONFIG_HOME = XDG_CONFIG_HOME_before + +if __name__ == '__main__': + main(__file__) From 2ab21517ef052b053040bb685862dba6442b27b1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 19 Jan 2021 17:57:48 +0100 Subject: [PATCH 54/70] ocrd resmgr: use resource_location from config for default --- ocrd/ocrd/cli/resmgr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index de46729ba2..8d5887718d 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -18,6 +18,9 @@ from ocrd_validators import OcrdZipValidator from ..resource_manager import OcrdResourceManager +from ..config import load_config_file + +config = load_config_file() def print_resources(executable, reslist): 
print('%s' % executable) @@ -56,7 +59,7 @@ def list_installed(executable=None): @resmgr_cli.command('download') @click.option('-n', '--any-url', help='Allow downloading/copying unregistered resources', is_flag=True) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) -@click.option('-l', '--location', help='Where to store resources', type=click.Choice(RESOURCE_LOCATIONS), default='virtualenv', show_default=True) +@click.option('-l', '--location', help='Where to store resources', type=click.Choice(RESOURCE_LOCATIONS), default=config.resource_location, show_default=True) @click.argument('executable', required=True) @click.argument('url_or_name', required=True) def download(any_url, overwrite, location, executable, url_or_name): From 46878868ca2d35572037cc40b734d575554f9c93 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 19 Jan 2021 17:59:53 +0100 Subject: [PATCH 55/70] config: merge with default config for updated config --- ocrd/ocrd/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/config.py b/ocrd/ocrd/config.py index 86ef5f5d48..ada3435e02 100644 --- a/ocrd/ocrd/config.py +++ b/ocrd/ocrd/config.py @@ -13,12 +13,13 @@ def load_config_file(): Load the configuration file """ fpath = Path(XDG_CONFIG_HOME, 'ocrd.yml') + obj = DEFAULT_CONFIG if not fpath.exists(): with open(str(fpath), 'w', encoding='utf-8') as f_out: f_out.write("# Generated by OCR-D/core %s on %s\n" % (VERSION, datetime.now())) - f_out.write(safe_dump(DEFAULT_CONFIG)) + f_out.write(safe_dump(obj)) with open(str(fpath), 'r', encoding='utf-8') as f_in: - obj = safe_load(f_in.read()) + obj = {**obj, **safe_load(f_in.read())} report = OcrdConfigValidator.validate(obj) if not report.is_valid: raise ValueError("The configuration is invalid: %s" % report.errors) From 032929e63782e55c53eeb070d84cdf9d15c7ea73 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 19 Jan 2021 18:03:16 +0100 Subject: [PATCH 56/70] move 
config file to $XDG_CONFIG_HOME/ocrd/config.yml for consistency --- ocrd/ocrd/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/config.py b/ocrd/ocrd/config.py index ada3435e02..fafa295dac 100644 --- a/ocrd/ocrd/config.py +++ b/ocrd/ocrd/config.py @@ -12,7 +12,9 @@ def load_config_file(): """ Load the configuration file """ - fpath = Path(XDG_CONFIG_HOME, 'ocrd.yml') + fpath = Path(XDG_CONFIG_HOME, 'ocrd', 'config.yml') + if not fpath.parent.exists(): + fpath.parent.mkdir() obj = DEFAULT_CONFIG if not fpath.exists(): with open(str(fpath), 'w', encoding='utf-8') as f_out: From c6a53b02d1cfa95dc679cd5147ed275f6c203345 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 19 Jan 2021 18:58:24 +0100 Subject: [PATCH 57/70] resource manager: methods to resolve resource dirs --- ocrd/ocrd/cli/resmgr.py | 6 +----- ocrd/ocrd/resource_manager.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 8d5887718d..7cbcb035d2 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -74,11 +74,7 @@ def download(any_url, overwrite, location, executable, url_or_name): """ log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() - basedir = join(VIRTUAL_ENV, 'ocrd-resources') if location == 'virtualenv' and VIRTUAL_ENV else \ - join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ - join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ - join(XDG_CONFIG_HOME, 'ocrd-resources') if location == 'config' else \ - getcwd() + basedir = resmgr.get_resource_dir(location) is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') is_filename = Path(url_or_name).exists() find_kwargs = {'executable': executable} diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 23269ea11e..a48bfddc32 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ 
-1,6 +1,6 @@ from pathlib import Path from os.path import join -from os import environ, listdir +from os import environ, listdir, getcwd import re from shutil import copytree from datetime import datetime @@ -16,6 +16,7 @@ from ocrd_utils.os import list_all_resources, pushd_popd from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT +from .config import load_config_file class OcrdResourceManager(): @@ -132,6 +133,18 @@ def find_resources(self, executable=None, name=None, url=None, database=None): ret.append((executable, resdict)) return ret + def get_resource_dir(self, location): + return join(VIRTUAL_ENV, 'ocrd-resources') if location == 'virtualenv' and VIRTUAL_ENV else \ + join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ + join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ + join(XDG_CONFIG_HOME, 'ocrd-resources') if location == 'config' else \ + getcwd() + + @property + def default_resource_dir(self): + config = load_config_file() + return self.get_resource_dir(config.resource_location) + def parameter_usage(self, name, usage='as-is'): if usage == 'as-is': return name From 53a591d83d361bfc42196ba678d0215249f9f86e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 20 Jan 2021 12:38:50 +0100 Subject: [PATCH 58/70] :package: v2.22.0b4 --- CHANGELOG.md | 5 +++++ ocrd_utils/setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b152b2729..30f7062031 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [2.22.0b4] - 2021-01-20 + Added: * Implement file resource algorithm from OCR-D/spec#169, #559 @@ -1266,6 +1268,9 @@ Fixed Initial Release +[2.22.0b4]: ../../compare/v2.22.0b4..v2.22.0b3 +[2.22.0b3]: ../../compare/v2.22.0b3..v2.22.0b2 +[2.22.0b2]: ../../compare/v2.22.0b2..v2.22.0b1 [2.22.0b1]: ../../compare/v2.22.0b1..v2.21.0 [2.21.0]: ../../compare/v2.21.0..v2.20.2 [2.20.2]: ../../compare/v2.20.2..v2.20.1 diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index 838a7c7952..e33ff7d31c 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='2.22.0b3', + version='2.22.0b4', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown', From a05ecf4d2812df1750b894d26760e4fea3c62d9b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 20 Jan 2021 16:38:40 +0100 Subject: [PATCH 59/70] fix ocrd_config test --- tests/test_ocrd_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_ocrd_config.py b/tests/test_ocrd_config.py index 8d25dbd92e..cb29fef461 100644 --- a/tests/test_ocrd_config.py +++ b/tests/test_ocrd_config.py @@ -1,5 +1,6 @@ from tests.base import main from unittest import mock +from pathlib import Path import ocrd_utils @@ -7,7 +8,8 @@ def test_config_loading(): XDG_CONFIG_HOME_before = ocrd_utils.XDG_CONFIG_HOME with ocrd_utils.pushd_popd(tempdir=True) as tempdir: ocrd_utils.XDG_CONFIG_HOME = tempdir - with open('ocrd.yml', 'w', encoding='utf-8') as f: + Path('ocrd').mkdir() + with open('ocrd/config.yml', 'w', encoding='utf-8') as f: f.write('resource_location: cache\n') from ocrd.config import load_config_file obj = load_config_file() From 61a88452df504d291b43a4fa8c21fb7518773a98 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 20 Jan 2021 17:06:47 +0100 Subject: [PATCH 60/70] config: mkdir -p $(basename) --- ocrd/ocrd/config.py | 11 
+++++++---- tests/test_ocrd_config.py | 11 ++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ocrd/ocrd/config.py b/ocrd/ocrd/config.py index fafa295dac..5cef833d0c 100644 --- a/ocrd/ocrd/config.py +++ b/ocrd/ocrd/config.py @@ -4,17 +4,20 @@ from yaml import safe_load, safe_dump from ocrd_models import OcrdConfig -from ocrd_utils import XDG_CONFIG_HOME, VERSION +from ocrd_utils import VERSION +import ocrd_utils from ocrd_validators import OcrdConfigValidator from ocrd_models.ocrd_config import DEFAULT_CONFIG -def load_config_file(): +def load_config_file(basedir=None): """ Load the configuration file """ - fpath = Path(XDG_CONFIG_HOME, 'ocrd', 'config.yml') + if not basedir: + basedir = ocrd_utils.XDG_CONFIG_HOME + fpath = Path(basedir, 'ocrd', 'config.yml') if not fpath.parent.exists(): - fpath.parent.mkdir() + fpath.parent.mkdir(parents=True) obj = DEFAULT_CONFIG if not fpath.exists(): with open(str(fpath), 'w', encoding='utf-8') as f_out: diff --git a/tests/test_ocrd_config.py b/tests/test_ocrd_config.py index cb29fef461..48ff865c6c 100644 --- a/tests/test_ocrd_config.py +++ b/tests/test_ocrd_config.py @@ -2,19 +2,16 @@ from unittest import mock from pathlib import Path -import ocrd_utils +from ocrd_utils import pushd_popd +from ocrd.config import load_config_file def test_config_loading(): - XDG_CONFIG_HOME_before = ocrd_utils.XDG_CONFIG_HOME - with ocrd_utils.pushd_popd(tempdir=True) as tempdir: - ocrd_utils.XDG_CONFIG_HOME = tempdir + with pushd_popd(tempdir=True) as tempdir: Path('ocrd').mkdir() with open('ocrd/config.yml', 'w', encoding='utf-8') as f: f.write('resource_location: cache\n') - from ocrd.config import load_config_file - obj = load_config_file() + obj = load_config_file(tempdir) assert obj.dump() == {'resource_location': 'cache'} - ocrd_utils.XDG_CONFIG_HOME = XDG_CONFIG_HOME_before if __name__ == '__main__': main(__file__) From fd8ca26e729ea132f9fe1d301e97fb6aa1f1b341 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer 
Date: Thu, 21 Jan 2021 15:44:35 +0100 Subject: [PATCH 61/70] :bug: resmgr: virtualenv location was missing "share" --- ocrd/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index a48bfddc32..cb6c9a52c4 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -134,7 +134,7 @@ def find_resources(self, executable=None, name=None, url=None, database=None): return ret def get_resource_dir(self, location): - return join(VIRTUAL_ENV, 'ocrd-resources') if location == 'virtualenv' and VIRTUAL_ENV else \ + return join(VIRTUAL_ENV, 'share', 'ocrd-resources') if location == 'virtualenv' and VIRTUAL_ENV else \ join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ join(XDG_CONFIG_HOME, 'ocrd-resources') if location == 'config' else \ From a3cff9e8fd31b7d25236f44fb8b45034978528f0 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 21 Jan 2021 19:16:35 +0100 Subject: [PATCH 62/70] resmgr: show shorthand location in list-installed --- ocrd/ocrd/cli/resmgr.py | 15 ++++++++++----- ocrd/ocrd/resource_manager.py | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 7cbcb035d2..547d72d30e 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -22,10 +22,15 @@ config = load_config_file() -def print_resources(executable, reslist): +def print_resources(executable, reslist, resmgr): print('%s' % executable) for resdict in reslist: - print('- %s (%s)\n %s' % (resdict['name'], resdict['url'], resdict['description'])) + print('- %s %s (%s)\n %s' % ( + resdict['name'], + '@ %s' % resmgr.resource_dir_to_location(resdict['path']) if 'path' in resdict else '', + resdict['url'], + resdict['description'] + )) print() @click.group("resmgr") @@ -43,7 +48,7 @@ def 
list_available(executable=None): """ resmgr = OcrdResourceManager() for executable, reslist in resmgr.list_available(executable): - print_resources(executable, reslist) + print_resources(executable, reslist, resmgr) @resmgr_cli.command('list-installed') @click.option('-e', '--executable', help='Show only resources for executable EXEC', metavar='EXEC') @@ -54,7 +59,7 @@ def list_installed(executable=None): resmgr = OcrdResourceManager() ret = [] for executable, reslist in resmgr.list_installed(executable): - print_resources(executable, reslist) + print_resources(executable, reslist, resmgr) @resmgr_cli.command('download') @click.option('-n', '--any-url', help='Allow downloading/copying unregistered resources', is_flag=True) @@ -74,7 +79,7 @@ def download(any_url, overwrite, location, executable, url_or_name): """ log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() - basedir = resmgr.get_resource_dir(location) + basedir = resmgr.location_to_resource_dir(location) is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') is_filename = Path(url_or_name).exists() find_kwargs = {'executable': executable} diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index cb6c9a52c4..d74615c52c 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -85,6 +85,7 @@ def list_installed(self, executable=None): if not resdict: self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'" % (this_executable, res_name, res_filename, self.user_list)) resdict = [self.add_to_user_database(this_executable, res_filename)] + resdict[0]['path'] = res_filename reslist.append(resdict[0]) ret.append((this_executable, reslist)) return ret @@ -103,7 +104,7 @@ def add_to_user_database(self, executable, res_filename, url=None): resdict = { 'name': res_name, 'url': url if url else '???', - 'description': 'Found at %s on %s' % (res_filename, datetime.now()), + 'description': 'Found at %s on %s' % 
(self.resource_dir_to_location(res_filename), datetime.now()), 'version_range': '???', 'size': res_size } @@ -133,17 +134,25 @@ def find_resources(self, executable=None, name=None, url=None, database=None): ret.append((executable, resdict)) return ret - def get_resource_dir(self, location): + def location_to_resource_dir(self, location): return join(VIRTUAL_ENV, 'share', 'ocrd-resources') if location == 'virtualenv' and VIRTUAL_ENV else \ join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ join(XDG_CONFIG_HOME, 'ocrd-resources') if location == 'config' else \ getcwd() + def resource_dir_to_location(self, resource_path): + resource_path = str(resource_path) + return 'virtualenv' if VIRTUAL_ENV and resource_path.startswith(join(VIRTUAL_ENV, 'share', 'ocrd-resources')) else \ + 'cache' if resource_path.startswith(join(XDG_CACHE_HOME, 'ocrd-resources')) else \ + 'data' if resource_path.startswith(join(XDG_DATA_HOME, 'ocrd-resources')) else \ + 'config' if resource_path.startswith(join(XDG_CONFIG_HOME, 'ocrd-resources')) else \ + resource_path + @property def default_resource_dir(self): config = load_config_file() - return self.get_resource_dir(config.resource_location) + return self.location_to_resource_dir(config.resource_location) def parameter_usage(self, name, usage='as-is'): if usage == 'as-is': From 9280ef4d708d5c8944f1dc66046b6f8f4929f506 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 22 Jan 2021 16:26:27 +0100 Subject: [PATCH 63/70] remove virtualenv, introduce /usr/local/share --- ocrd/ocrd/cli/resmgr.py | 1 - ocrd/ocrd/resource_manager.py | 10 ++++------ ocrd_models/ocrd_models/ocrd_config.py | 2 +- ocrd_utils/ocrd_utils/__init__.py | 1 - ocrd_utils/ocrd_utils/constants.py | 4 +--- ocrd_utils/ocrd_utils/os.py | 10 ++++------ ocrd_validators/ocrd_validators/ocrd_config.schema.yml | 4 ++-- repo/spec | 2 +- 8 files changed, 13 insertions(+), 21 deletions(-) diff 
--git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 547d72d30e..1eaed76c9c 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -9,7 +9,6 @@ from ocrd_utils import ( initLogging, getLogger, - VIRTUAL_ENV, RESOURCE_LOCATIONS, XDG_CACHE_HOME, XDG_CONFIG_HOME, diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index d74615c52c..d352a5f9fc 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -12,7 +12,7 @@ from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger -from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME, VIRTUAL_ENV +from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME from ocrd_utils.os import list_all_resources, pushd_popd from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT @@ -71,9 +71,7 @@ def list_installed(self, executable=None): # resources we know about all_executables = list(self.database.keys()) # resources in the file system - parent_dirs = [join(x, 'ocrd-resources') for x in [XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME]] - if VIRTUAL_ENV: - parent_dirs += [join(VIRTUAL_ENV, 'share', 'ocrd-resources')] + parent_dirs = [join(x, 'ocrd-resources') for x in [XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME, '/usr/local/share']] for parent_dir in parent_dirs: if Path(parent_dir).exists(): all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')] @@ -135,7 +133,7 @@ def find_resources(self, executable=None, name=None, url=None, database=None): return ret def location_to_resource_dir(self, location): - return join(VIRTUAL_ENV, 'share', 'ocrd-resources') if location == 'virtualenv' and VIRTUAL_ENV else \ + return '/usr/local/share/ocrd-resources' if location == 'system' else \ join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ join(XDG_CONFIG_HOME, 
'ocrd-resources') if location == 'config' else \ @@ -143,7 +141,7 @@ def location_to_resource_dir(self, location): def resource_dir_to_location(self, resource_path): resource_path = str(resource_path) - return 'virtualenv' if VIRTUAL_ENV and resource_path.startswith(join(VIRTUAL_ENV, 'share', 'ocrd-resources')) else \ + return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \ 'cache' if resource_path.startswith(join(XDG_CACHE_HOME, 'ocrd-resources')) else \ 'data' if resource_path.startswith(join(XDG_DATA_HOME, 'ocrd-resources')) else \ 'config' if resource_path.startswith(join(XDG_CONFIG_HOME, 'ocrd-resources')) else \ diff --git a/ocrd_models/ocrd_models/ocrd_config.py b/ocrd_models/ocrd_models/ocrd_config.py index aeb7db14f3..3930fa2d6b 100644 --- a/ocrd_models/ocrd_models/ocrd_config.py +++ b/ocrd_models/ocrd_models/ocrd_config.py @@ -4,7 +4,7 @@ import json DEFAULT_CONFIG = { - 'resource_location': 'virtualenv' + 'resource_location': 'data' } class OcrdConfig(): diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index 141cc6814b..9cfb78198e 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -80,7 +80,6 @@ LOG_FORMAT, LOG_TIMEFMT, VERSION, - VIRTUAL_ENV, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME) diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index 392011072c..4014eafa70 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -18,7 +18,6 @@ 'REGEX_FILE_ID', 'RESOURCE_LOCATIONS', 'VERSION', - 'VIRTUAL_ENV', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME', 'XDG_CACHE_HOME', @@ -107,6 +106,5 @@ XDG_DATA_HOME = environ['XDG_DATA_HOME'] if 'XDG_DATA_HOME' in environ else join(HOME, '.local', 'share') XDG_CONFIG_HOME = environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in environ else join(HOME, '.config') XDG_CACHE_HOME = environ['XDG_CACHE_HOME'] if 'XDG_CACHE_HOME' in environ else join(HOME, '.cache') 
-VIRTUAL_ENV = environ.get('VIRTUAL_ENV', None) -RESOURCE_LOCATIONS = ['virtualenv', 'cwd', 'cache', 'config', 'data'] +RESOURCE_LOCATIONS = ['data', 'cwd', 'cache', 'config', 'system'] diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 75ce5c6875..9b3a16fd48 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -69,11 +69,10 @@ def list_resource_candidates(executable, fname, cwd=getcwd(), is_file=False, is_ processor_path_var = '%s_PATH' % executable.replace('-', '_').upper() if processor_path_var in environ: candidates += [join(x, fname) for x in environ[processor_path_var].split(':')] - if 'VIRTUAL_ENV' in environ: - candidates.append(join(environ['VIRTUAL_ENV'], 'share', 'ocrd-resources', executable, fname)) candidates.append(join(XDG_DATA_HOME, 'ocrd-resources', executable, fname)) candidates.append(join(XDG_CONFIG_HOME, 'ocrd-resources', executable, fname)) candidates.append(join(XDG_CACHE_HOME, 'ocrd-resources', executable, fname)) + candidates.append(join('/usr/local/share/ocrd-resources', executable, fname)) if is_file: candidates = [c for c in candidates if Path(c).is_file()] if is_dir: @@ -94,13 +93,12 @@ def list_all_resources(executable): for processor_path in environ[processor_path_var].split(':'): if isdir(processor_path): candidates += list(scandir(processor_path)) - if 'VIRTUAL_ENV' in environ: - sharedir = join(environ['VIRTUAL_ENV'], 'share', 'ocrd-resources', executable) - if isdir(sharedir): - candidates += list(scandir(sharedir)) for xdgdir in [join(d, 'ocrd-resources', executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME]]: if isdir(xdgdir): candidates += list(scandir(xdgdir)) + systemdir = join('/usr/local/share/ocrd-resources', executable) + if isdir(systemdir): + candidates += list(scandir(systemdir)) return [x.path for x in candidates] # ht @pabs3 diff --git a/ocrd_validators/ocrd_validators/ocrd_config.schema.yml b/ocrd_validators/ocrd_validators/ocrd_config.schema.yml 
index 114216176a..25c3a92a3b 100644 --- a/ocrd_validators/ocrd_validators/ocrd_config.schema.yml +++ b/ocrd_validators/ocrd_validators/ocrd_config.schema.yml @@ -3,5 +3,5 @@ additionalProperties: true properties: resource_location: type: string - enum: ['virtualenv', 'cache', 'config', 'data', 'cwd'] - default: 'virtualenv' + enum: ['data', 'cache', 'config', 'system', 'cwd'] + default: 'data' diff --git a/repo/spec b/repo/spec index 6483f1cd55..f5d08e89b5 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 6483f1cd553783bda8cd68782ec65fc5f8910ef6 +Subproject commit f5d08e89b5b2274dfc2605b7ea20d1a22538605b From 9cb058a2cb287fc83c5e2b8b59815f039e3c61bd Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 22 Jan 2021 18:06:33 +0100 Subject: [PATCH 64/70] :fire: remove configuration file --- ocrd/ocrd/cli/resmgr.py | 5 +-- ocrd/ocrd/config.py | 31 ------------------- ocrd/ocrd/resource_manager.py | 6 ---- .../ocrd_modelfactory/__init__.py | 2 +- ocrd_models/ocrd_models/__init__.py | 1 - ocrd_models/ocrd_models/ocrd_config.py | 25 --------------- ocrd_validators/ocrd_validators/__init__.py | 2 -- ocrd_validators/ocrd_validators/constants.py | 1 - .../ocrd_validators/ocrd_config.schema.yml | 7 ----- .../ocrd_validators/ocrd_config_validator.py | 22 ------------- tests/test_ocrd_config.py | 17 ---------- 11 files changed, 2 insertions(+), 117 deletions(-) delete mode 100644 ocrd/ocrd/config.py delete mode 100644 ocrd_models/ocrd_models/ocrd_config.py delete mode 100644 ocrd_validators/ocrd_validators/ocrd_config.schema.yml delete mode 100644 ocrd_validators/ocrd_validators/ocrd_config_validator.py delete mode 100644 tests/test_ocrd_config.py diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 1eaed76c9c..400515dc2e 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -17,9 +17,6 @@ from ocrd_validators import OcrdZipValidator from ..resource_manager import OcrdResourceManager -from ..config import 
load_config_file - -config = load_config_file() def print_resources(executable, reslist, resmgr): print('%s' % executable) @@ -63,7 +60,7 @@ def list_installed(executable=None): @resmgr_cli.command('download') @click.option('-n', '--any-url', help='Allow downloading/copying unregistered resources', is_flag=True) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) -@click.option('-l', '--location', help='Where to store resources', type=click.Choice(RESOURCE_LOCATIONS), default=config.resource_location, show_default=True) +@click.option('-l', '--location', help='Where to store resources', type=click.Choice(RESOURCE_LOCATIONS), default='data', show_default=True) @click.argument('executable', required=True) @click.argument('url_or_name', required=True) def download(any_url, overwrite, location, executable, url_or_name): diff --git a/ocrd/ocrd/config.py b/ocrd/ocrd/config.py deleted file mode 100644 index 5cef833d0c..0000000000 --- a/ocrd/ocrd/config.py +++ /dev/null @@ -1,31 +0,0 @@ -from pathlib import Path -from datetime import datetime - -from yaml import safe_load, safe_dump - -from ocrd_models import OcrdConfig -from ocrd_utils import VERSION -import ocrd_utils -from ocrd_validators import OcrdConfigValidator -from ocrd_models.ocrd_config import DEFAULT_CONFIG - -def load_config_file(basedir=None): - """ - Load the configuration file - """ - if not basedir: - basedir = ocrd_utils.XDG_CONFIG_HOME - fpath = Path(basedir, 'ocrd', 'config.yml') - if not fpath.parent.exists(): - fpath.parent.mkdir(parents=True) - obj = DEFAULT_CONFIG - if not fpath.exists(): - with open(str(fpath), 'w', encoding='utf-8') as f_out: - f_out.write("# Generated by OCR-D/core %s on %s\n" % (VERSION, datetime.now())) - f_out.write(safe_dump(obj)) - with open(str(fpath), 'r', encoding='utf-8') as f_in: - obj = {**obj, **safe_load(f_in.read())} - report = OcrdConfigValidator.validate(obj) - if not report.is_valid: - raise ValueError("The configuration is 
invalid: %s" % report.errors) - return OcrdConfig(obj) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index d352a5f9fc..7471939dbb 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -16,7 +16,6 @@ from ocrd_utils.os import list_all_resources, pushd_popd from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT -from .config import load_config_file class OcrdResourceManager(): @@ -147,11 +146,6 @@ def resource_dir_to_location(self, resource_path): 'config' if resource_path.startswith(join(XDG_CONFIG_HOME, 'ocrd-resources')) else \ resource_path - @property - def default_resource_dir(self): - config = load_config_file() - return self.location_to_resource_dir(config.resource_location) - def parameter_usage(self, name, usage='as-is'): if usage == 'as-is': return name diff --git a/ocrd_modelfactory/ocrd_modelfactory/__init__.py b/ocrd_modelfactory/ocrd_modelfactory/__init__.py index 3e802a6810..8da8b5be34 100644 --- a/ocrd_modelfactory/ocrd_modelfactory/__init__.py +++ b/ocrd_modelfactory/ocrd_modelfactory/__init__.py @@ -10,7 +10,7 @@ from PIL import Image from ocrd_utils import VERSION, MIMETYPE_PAGE -from ocrd_models import OcrdExif, OcrdConfig +from ocrd_models import OcrdExif from ocrd_models.ocrd_page import PcGtsType, PageType, MetadataType, parse __all__ = [ diff --git a/ocrd_models/ocrd_models/__init__.py b/ocrd_models/ocrd_models/__init__.py index c29414690f..9a31a2d4c7 100644 --- a/ocrd_models/ocrd_models/__init__.py +++ b/ocrd_models/ocrd_models/__init__.py @@ -2,7 +2,6 @@ APIs and schemas for various file formats in the OCR domain. 
""" from .ocrd_agent import OcrdAgent -from .ocrd_config import OcrdConfig from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile from .ocrd_mets import OcrdMets diff --git a/ocrd_models/ocrd_models/ocrd_config.py b/ocrd_models/ocrd_models/ocrd_config.py deleted file mode 100644 index 3930fa2d6b..0000000000 --- a/ocrd_models/ocrd_models/ocrd_config.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -Configuration file -""" -import json - -DEFAULT_CONFIG = { - 'resource_location': 'data' -} - -class OcrdConfig(): - - __slots__ = DEFAULT_CONFIG.keys() - - def __str__(self): - return 'OcrdConfig %s' % json.dumps(self.__dict__) - - def dump(self): - ret = {} - for k in DEFAULT_CONFIG.keys(): - ret[k] = getattr(self, k) - return ret - - def __init__(self, obj): - for k, v in obj.items(): - setattr(self, k, v) diff --git a/ocrd_validators/ocrd_validators/__init__.py b/ocrd_validators/ocrd_validators/__init__.py index e2ce63e743..4819017dd0 100644 --- a/ocrd_validators/ocrd_validators/__init__.py +++ b/ocrd_validators/ocrd_validators/__init__.py @@ -6,7 +6,6 @@ 'WorkspaceValidator', 'PageValidator', 'OcrdToolValidator', - 'OcrdConfigValidator', 'OcrdResourceListValidator', 'OcrdZipValidator', 'XsdValidator', @@ -18,7 +17,6 @@ from .workspace_validator import WorkspaceValidator from .page_validator import PageValidator from .ocrd_tool_validator import OcrdToolValidator -from .ocrd_config_validator import OcrdConfigValidator from .resource_list_validator import OcrdResourceListValidator from .ocrd_zip_validator import OcrdZipValidator from .xsd_validator import XsdValidator diff --git a/ocrd_validators/ocrd_validators/constants.py b/ocrd_validators/ocrd_validators/constants.py index cfb9085f5c..25d2e0e53b 100644 --- a/ocrd_validators/ocrd_validators/constants.py +++ b/ocrd_validators/ocrd_validators/constants.py @@ -20,7 +20,6 @@ OCRD_TOOL_SCHEMA = yaml.safe_load(resource_string(__name__, 'ocrd_tool.schema.yml')) RESOURCE_LIST_SCHEMA = yaml.safe_load(resource_string(__name__, 
'resource_list.schema.yml')) -CONFIG_SCHEMA = yaml.safe_load(resource_string(__name__, 'ocrd_config.schema.yml')) OCRD_BAGIT_PROFILE = yaml.safe_load(resource_string(__name__, 'bagit-profile.yml')) BAGIT_TXT = 'BagIt-Version: 1.0\nTag-File-Character-Encoding: UTF-8' diff --git a/ocrd_validators/ocrd_validators/ocrd_config.schema.yml b/ocrd_validators/ocrd_validators/ocrd_config.schema.yml deleted file mode 100644 index 25c3a92a3b..0000000000 --- a/ocrd_validators/ocrd_validators/ocrd_config.schema.yml +++ /dev/null @@ -1,7 +0,0 @@ -type: object -additionalProperties: true -properties: - resource_location: - type: string - enum: ['data', 'cache', 'config', 'system', 'cwd'] - default: 'data' diff --git a/ocrd_validators/ocrd_validators/ocrd_config_validator.py b/ocrd_validators/ocrd_validators/ocrd_config_validator.py deleted file mode 100644 index 40e4c1ac65..0000000000 --- a/ocrd_validators/ocrd_validators/ocrd_config_validator.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -Validating $HOME/.config/ocrd.yml -""" -from .constants import CONFIG_SCHEMA -from .json_validator import JsonValidator - -# -# ------------------------------------------------- -# - -class OcrdConfigValidator(JsonValidator): - """ - JsonValidator validating against the ``ocrd-tool.json`` schema. - """ - - @staticmethod - def validate(obj, schema=CONFIG_SCHEMA): - """ - Validate against ``ocrd_config.schema.yml`` schema. 
- """ - return JsonValidator.validate(obj, schema) - diff --git a/tests/test_ocrd_config.py b/tests/test_ocrd_config.py deleted file mode 100644 index 48ff865c6c..0000000000 --- a/tests/test_ocrd_config.py +++ /dev/null @@ -1,17 +0,0 @@ -from tests.base import main -from unittest import mock -from pathlib import Path - -from ocrd_utils import pushd_popd -from ocrd.config import load_config_file - -def test_config_loading(): - with pushd_popd(tempdir=True) as tempdir: - Path('ocrd').mkdir() - with open('ocrd/config.yml', 'w', encoding='utf-8') as f: - f.write('resource_location: cache\n') - obj = load_config_file(tempdir) - assert obj.dump() == {'resource_location': 'cache'} - -if __name__ == '__main__': - main(__file__) From 7e26a072223fd153071d7adbb0c566d5471fbe22 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 22 Jan 2021 18:35:10 +0100 Subject: [PATCH 65/70] resmgr: lookup in XDG_DATA_HOME and absolute path only --- ocrd/ocrd/cli/resmgr.py | 5 +---- ocrd/ocrd/processor/base.py | 1 - ocrd/ocrd/resource_manager.py | 10 +++------- ocrd_utils/ocrd_utils/__init__.py | 1 - ocrd_utils/ocrd_utils/constants.py | 4 +--- ocrd_utils/ocrd_utils/os.py | 10 ++++------ tests/utils/test_os.py | 12 +----------- 7 files changed, 10 insertions(+), 33 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 400515dc2e..3f86b657ee 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -9,10 +9,7 @@ from ocrd_utils import ( initLogging, getLogger, - RESOURCE_LOCATIONS, - XDG_CACHE_HOME, - XDG_CONFIG_HOME, - XDG_DATA_HOME + RESOURCE_LOCATIONS ) from ocrd_validators import OcrdZipValidator diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index dbde1dba6a..2387c0a894 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -26,7 +26,6 @@ initLogging, list_resource_candidates, list_all_resources, - XDG_CACHE_HOME ) from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import 
MetadataItemType, LabelType, LabelsType diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 7471939dbb..1136b68879 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -12,7 +12,7 @@ from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger -from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME +from ocrd_utils.constants import HOME, XDG_DATA_HOME, XDG_CONFIG_HOME from ocrd_utils.os import list_all_resources, pushd_popd from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT @@ -70,7 +70,7 @@ def list_installed(self, executable=None): # resources we know about all_executables = list(self.database.keys()) # resources in the file system - parent_dirs = [join(x, 'ocrd-resources') for x in [XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME, '/usr/local/share']] + parent_dirs = [join(x, 'ocrd-resources') for x in [XDG_DATA_HOME, '/usr/local/share']] for parent_dir in parent_dirs: if Path(parent_dir).exists(): all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')] @@ -133,17 +133,13 @@ def find_resources(self, executable=None, name=None, url=None, database=None): def location_to_resource_dir(self, location): return '/usr/local/share/ocrd-resources' if location == 'system' else \ - join(XDG_CACHE_HOME, 'ocrd-resources') if location == 'cache' else \ join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ - join(XDG_CONFIG_HOME, 'ocrd-resources') if location == 'config' else \ getcwd() def resource_dir_to_location(self, resource_path): resource_path = str(resource_path) return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \ - 'cache' if resource_path.startswith(join(XDG_CACHE_HOME, 'ocrd-resources')) else \ 'data' if resource_path.startswith(join(XDG_DATA_HOME, 'ocrd-resources')) else \ - 'config' if resource_path.startswith(join(XDG_CONFIG_HOME, 'ocrd-resources')) else \ 
resource_path def parameter_usage(self, name, usage='as-is'): @@ -181,8 +177,8 @@ def download( self, executable, url, + basedir, overwrite=False, - basedir=XDG_CACHE_HOME, name=None, resource_type='file', path_in_archive='.', diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index 9cfb78198e..1fd782f592 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -80,7 +80,6 @@ LOG_FORMAT, LOG_TIMEFMT, VERSION, - XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME) diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index 4014eafa70..121e5df612 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -20,7 +20,6 @@ 'VERSION', 'XDG_CONFIG_HOME', 'XDG_DATA_HOME', - 'XDG_CACHE_HOME', ] VERSION = get_distribution('ocrd_utils').version @@ -105,6 +104,5 @@ HOME = expanduser('~') XDG_DATA_HOME = environ['XDG_DATA_HOME'] if 'XDG_DATA_HOME' in environ else join(HOME, '.local', 'share') XDG_CONFIG_HOME = environ['XDG_CONFIG_HOME'] if 'XDG_CONFIG_HOME' in environ else join(HOME, '.config') -XDG_CACHE_HOME = environ['XDG_CACHE_HOME'] if 'XDG_CACHE_HOME' in environ else join(HOME, '.cache') -RESOURCE_LOCATIONS = ['data', 'cwd', 'cache', 'config', 'system'] +RESOURCE_LOCATIONS = ['data', 'cwd', 'system'] diff --git a/ocrd_utils/ocrd_utils/os.py b/ocrd_utils/ocrd_utils/os.py index 9b3a16fd48..027312ebbd 100644 --- a/ocrd_utils/ocrd_utils/os.py +++ b/ocrd_utils/ocrd_utils/os.py @@ -18,7 +18,7 @@ from atomicwrites import atomic_write as atomic_write_, AtomicWriter -from .constants import XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME +from .constants import XDG_DATA_HOME def abspath(url): """ @@ -70,8 +70,6 @@ def list_resource_candidates(executable, fname, cwd=getcwd(), is_file=False, is_ if processor_path_var in environ: candidates += [join(x, fname) for x in environ[processor_path_var].split(':')] candidates.append(join(XDG_DATA_HOME, 'ocrd-resources', 
executable, fname)) - candidates.append(join(XDG_CONFIG_HOME, 'ocrd-resources', executable, fname)) - candidates.append(join(XDG_CACHE_HOME, 'ocrd-resources', executable, fname)) candidates.append(join('/usr/local/share/ocrd-resources', executable, fname)) if is_file: candidates = [c for c in candidates if Path(c).is_file()] @@ -93,9 +91,9 @@ def list_all_resources(executable): for processor_path in environ[processor_path_var].split(':'): if isdir(processor_path): candidates += list(scandir(processor_path)) - for xdgdir in [join(d, 'ocrd-resources', executable) for d in [XDG_DATA_HOME, XDG_CONFIG_HOME, XDG_CACHE_HOME]]: - if isdir(xdgdir): - candidates += list(scandir(xdgdir)) + datadir = join(XDG_DATA_HOME, 'ocrd-resources', executable) + if isdir(datadir): + candidates += list(scandir(datadir)) systemdir = join('/usr/local/share/ocrd-resources', executable) if isdir(systemdir): candidates += list(scandir(systemdir)) diff --git a/tests/utils/test_os.py b/tests/utils/test_os.py index ca33240413..336ef595b9 100644 --- a/tests/utils/test_os.py +++ b/tests/utils/test_os.py @@ -13,19 +13,11 @@ class TestOsUtils(TestCase): def setUp(self): self.maxDiff = None self.tempdir_path = mkdtemp() - self.tempdir_venv = mkdtemp() ENV['OCRD_DUMMY_PATH'] = self.tempdir_path - self.VIRTUAL_ENV = ENV.get('VIRTUAL_ENV') - ENV['VIRTUAL_ENV'] = self.tempdir_venv def tearDown(self): rmtree(self.tempdir_path) - rmtree(self.tempdir_venv) del ENV['OCRD_DUMMY_PATH'] - if self.VIRTUAL_ENV: - ENV['VIRTUAL_ENV'] = self.VIRTUAL_ENV - else: - del ENV['VIRTUAL_ENV'] def test_resolve_basic(self): def dehomify(s): @@ -37,10 +29,8 @@ def dehomify(s): self.assertEqual(cands, [join(x, fname) for x in [ dehomify(join(getcwd(), 'ocrd-resources')), dehomify(self.tempdir_path), - dehomify(join(self.tempdir_venv, 'share', 'ocrd-resources', 'ocrd-dummy')), '$HOME/.local/share/ocrd-resources/ocrd-dummy', - '$HOME/.config/ocrd-resources/ocrd-dummy', - '$HOME/.cache/ocrd-resources/ocrd-dummy', + 
'/usr/local/share/ocrd-resources/ocrd-dummy', ]]) From 22fb2c62c09bd030cbc2faaf9ac2e67b22dc8b9e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 25 Jan 2021 12:26:29 +0100 Subject: [PATCH 66/70] resmgr download: be stricter about uninstalled processors --- ocrd/ocrd/cli/resmgr.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 3f86b657ee..8a13499109 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -2,8 +2,9 @@ from os import getcwd from os.path import join from pathlib import Path -import requests +from distutils.spawn import find_executable as which +import requests import click from ocrd_utils import ( @@ -56,11 +57,12 @@ def list_installed(executable=None): @resmgr_cli.command('download') @click.option('-n', '--any-url', help='Allow downloading/copying unregistered resources', is_flag=True) +@click.option('-a', '--allow-uninstalled', help="Allow installing resources for uninstalled processors", is_flag=True) @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) @click.option('-l', '--location', help='Where to store resources', type=click.Choice(RESOURCE_LOCATIONS), default='data', show_default=True) @click.argument('executable', required=True) @click.argument('url_or_name', required=True) -def download(any_url, overwrite, location, executable, url_or_name): +def download(any_url, allow_uninstalled, overwrite, location, executable, url_or_name): """ Download resource URL_OR_NAME for processor EXECUTABLE. @@ -75,6 +77,13 @@ def download(any_url, overwrite, location, executable, url_or_name): basedir = resmgr.location_to_resource_dir(location) is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') is_filename = Path(url_or_name).exists() + if not which(executable): + if not allow_uninstalled: + log.error("Executable %s is not installed. Is there a typo in the executable? 
" \ + "To install resources for uninstalled processor, use the -a/--allow-uninstalled flag" % executable) + sys.exit(1) + else: + log.warning("Executable %s is not installed but -a/--allow-uninstalled was given, so proceeding" % executable) find_kwargs = {'executable': executable} if url_or_name != '*': find_kwargs['url' if is_url else 'name'] = url_or_name From 2b3cb645d6dc7fe126b391983c6ae8bdfddc65a5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 25 Jan 2021 14:20:53 +0100 Subject: [PATCH 67/70] resmgr download "*" --- ocrd/ocrd/cli/resmgr.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index 8a13499109..e1a1735211 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -61,7 +61,7 @@ def list_installed(executable=None): @click.option('-o', '--overwrite', help='Overwrite existing resources', is_flag=True) @click.option('-l', '--location', help='Where to store resources', type=click.Choice(RESOURCE_LOCATIONS), default='data', show_default=True) @click.argument('executable', required=True) -@click.argument('url_or_name', required=True) +@click.argument('url_or_name', required=False) def download(any_url, allow_uninstalled, overwrite, location, executable, url_or_name): """ Download resource URL_OR_NAME for processor EXECUTABLE. 
@@ -75,9 +75,14 @@ def download(any_url, allow_uninstalled, overwrite, location, executable, url_or log = getLogger('ocrd.cli.resmgr') resmgr = OcrdResourceManager() basedir = resmgr.location_to_resource_dir(location) - is_url = url_or_name.startswith('https://') or url_or_name.startswith('http://') - is_filename = Path(url_or_name).exists() - if not which(executable): + if executable != '*' and not url_or_name: + log.error("Unless EXECUTABLE ('%s') is the '*' wildcard, URL_OR_NAME is required" % executable) + sys.exit(1) + elif executable == '*': + executable = None + is_url = (url_or_name.startswith('https://') or url_or_name.startswith('http://')) if url_or_name else False + is_filename = Path(url_or_name).exists() if url_or_name else False + if executable and not which(executable): if not allow_uninstalled: log.error("Executable %s is not installed. Is there a typo in the executable? " \ "To install resources for uninstalled processor, use the -a/--allow-uninstalled flag" % executable) @@ -85,7 +90,7 @@ def download(any_url, allow_uninstalled, overwrite, location, executable, url_or else: log.warning("Executable %s is not installed but -a/--allow-uninstalled was given, so proceeding" % executable) find_kwargs = {'executable': executable} - if url_or_name != '*': + if url_or_name and url_or_name != '*': find_kwargs['url' if is_url else 'name'] = url_or_name reslist = resmgr.find_resources(**find_kwargs) if not reslist: @@ -112,7 +117,10 @@ def download(any_url, allow_uninstalled, overwrite, location, executable, url_or else: sys.exit(1) else: - for _, resdict in reslist: + for executable, resdict in reslist: + if not allow_uninstalled and not which(executable): + log.info("Skipping installing resources for %s as it is not installed. 
(Use -a/--allow-uninstalled to force)" % executable) + continue if resdict['url'] == '???': log.info("Cannot download user resource %s" % (resdict['name'])), continue From ac74c3da42877975912c0795bb3c925fc2ee6947 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 25 Jan 2021 17:15:03 +0100 Subject: [PATCH 68/70] Update ocrd/ocrd/resource_manager.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 1136b68879..b9c71a633d 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -70,7 +70,7 @@ def list_installed(self, executable=None): # resources we know about all_executables = list(self.database.keys()) # resources in the file system - parent_dirs = [join(x, 'ocrd-resources') for x in [XDG_DATA_HOME, '/usr/local/share']] + parent_dirs = [join(x, 'ocrd-resources') for x in [XDG_DATA_HOME, '/usr/local/share', getcwd()]] for parent_dir in parent_dirs: if Path(parent_dir).exists(): all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')] From 134a0c19fed8a31b880d333932efe814b60a8715 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 25 Jan 2021 17:15:35 +0100 Subject: [PATCH 69/70] Update ocrd/ocrd/resource_manager.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd/ocrd/resource_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index b9c71a633d..d72312e386 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -131,6 +131,9 @@ def find_resources(self, executable=None, name=None, url=None, database=None): ret.append((executable, resdict)) return ret + @property + def default_resource_dir(self): + return self.location_to_resource_dir('data') def location_to_resource_dir(self, location):
return '/usr/local/share/ocrd-resources' if location == 'system' else \ join(XDG_DATA_HOME, 'ocrd-resources') if location == 'data' else \ From a5858ec4a23cf6b49c648699af049dff5df1d114 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 25 Jan 2021 17:16:47 +0100 Subject: [PATCH 70/70] allow "from ocrd import OcrdResourceManager" --- ocrd/ocrd/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd/ocrd/__init__.py b/ocrd/ocrd/__init__.py index 121ce38849..df04f93e03 100644 --- a/ocrd/ocrd/__init__.py +++ b/ocrd/ocrd/__init__.py @@ -20,3 +20,4 @@ from ocrd_validators import * from ocrd.workspace import Workspace from ocrd.workspace_backup import WorkspaceBackupManager +from ocrd.resource_manager import OcrdResourceManager