From 0352e09dfd79dc39e43074f57aaf83be3b22c3ad Mon Sep 17 00:00:00 2001
From: Alberto Soragna <alberto.soragna@gmail.com>
Date: Tue, 18 Oct 2022 07:35:54 +0900
Subject: [PATCH 1/2] various fixes to ament-copyright and add
 --allowed-licenses argument

Signed-off-by: Alberto Soragna <alberto.soragna@gmail.com>
---
 ament_copyright/ament_copyright/main.py   | 44 ++++++++++----
 ament_copyright/ament_copyright/parser.py | 72 ++++++++++++++---------
 ament_copyright/test/test_parser.py       | 10 ++--
 3 files changed, 83 insertions(+), 43 deletions(-)

diff --git a/ament_copyright/ament_copyright/main.py b/ament_copyright/ament_copyright/main.py
index 3077ca1c3..833ba11e7 100644
--- a/ament_copyright/ament_copyright/main.py
+++ b/ament_copyright/ament_copyright/main.py
@@ -29,11 +29,11 @@
 from ament_copyright import UNKNOWN_IDENTIFIER
 from ament_copyright.crawler import get_files
 from ament_copyright.parser import get_comment_block
+from ament_copyright.parser import get_copyright_information_regex
 from ament_copyright.parser import get_index_of_next_line
 from ament_copyright.parser import parse_file
 from ament_copyright.parser import scan_past_coding_and_shebang_lines
 from ament_copyright.parser import scan_past_empty_lines
-from ament_copyright.parser import search_copyright_information
 
 
 def main(argv=sys.argv[1:]):
@@ -61,6 +61,13 @@ def main(argv=sys.argv[1:]):
         default=[],
         dest='excludes',
         help='The filenames to exclude.')
+    parser.add_argument(
+        '--allowed-licenses',
+        metavar='license name',
+        nargs='+',
+        default=[],
+        dest='allowed_licenses',
+        help='List of valid licenses.')
     group = parser.add_mutually_exclusive_group()
     group.add_argument(
         '--add-missing',
@@ -112,9 +119,20 @@ def main(argv=sys.argv[1:]):
     if not filenames:
         print('No repository roots and files found')
 
+    # if user has specified a list of allowed licenses, use only those
+    if args.allowed_licenses:
+        allowed_licenses = {}
+        for license_name in args.allowed_licenses:
+            if license_name in licenses:
+                allowed_licenses[license_name] = licenses[license_name]
+            else:
+                assert False, 'Requested unknown license: ' + license_name
+    else:
+        allowed_licenses = licenses
+
     file_descriptors = {}
     for filename in sorted(filenames):
-        file_descriptors[filename] = parse_file(filename)
+        file_descriptors[filename] = parse_file(filename, allowed_licenses)
 
     if args.add_missing:
         name = names.get(args.add_missing[0], args.add_missing[0])
@@ -274,7 +292,7 @@ def add_copyright_year(file_descriptors, new_years, verbose):
         file_descriptor = file_descriptors[path]
 
         # ignore files which do not have a header
-        if not getattr(file_descriptor, 'copyright_identifier', None):
+        if not getattr(file_descriptor, 'copyright_identifiers', None):
             continue
 
         index = scan_past_coding_and_shebang_lines(file_descriptor.content)
@@ -287,10 +305,13 @@ def add_copyright_year(file_descriptors, new_years, verbose):
         else:
             block = file_descriptor.content[index:]
             block_offset = 0
-        copyright_span, years_span, name_span = search_copyright_information(block)
-        if copyright_span is None:
+
+        regex = get_copyright_information_regex()
+        match = regex.search(block)
+        if not match:
             assert False, "Could not find copyright information in file '%s'" % \
                 file_descriptor.path
+        years_span, _ = match.span(1), match.span(2)
 
         # skip if all new years are already included
         years = get_years_from_string(block[years_span[0]:years_span[1]])
@@ -311,12 +332,13 @@ def add_copyright_year(file_descriptors, new_years, verbose):
             file_descriptor.content[global_years_span[1]:]
 
         # output beginning of file for debugging
-        # index = global_years_span[0]
-        # for _ in range(3):
-        #     index = get_index_of_next_line(content, index)
-        # print('<<<')
-        # print(content[:index - 1])
-        # print('>>>')
+        if verbose:
+            index = global_years_span[0]
+            for _ in range(3):
+                index = get_index_of_next_line(content, index)
+                print('<<<')
+                print(content[:index - 1])
+                print('>>>')
 
         with open(file_descriptor.path, 'w', encoding='utf-8') as h:
             h.write(content)
diff --git a/ament_copyright/ament_copyright/parser.py b/ament_copyright/ament_copyright/parser.py
index ef19d99a7..6d2740e83 100644
--- a/ament_copyright/ament_copyright/parser.py
+++ b/ament_copyright/ament_copyright/parser.py
@@ -52,7 +52,7 @@ def read(self):
         with open(self.path, 'r', encoding='utf-8') as h:
             self.content = h.read()
 
-    def parse(self):
+    def parse(self, allowed_licenses):
         raise NotImplementedError()
 
     def identify_license(self, content, license_part, licenses=None):
@@ -104,7 +104,7 @@ def identify_copyright(self):
             else:
                 self.copyright_identifiers.append(UNKNOWN_IDENTIFIER)
 
-    def parse(self):
+    def parse(self, allowed_licenses):
         self.read()
         if not self.content:
             return
@@ -113,36 +113,48 @@ def parse(self):
         index = scan_past_coding_and_shebang_lines(self.content)
         index = scan_past_empty_lines(self.content, index)
 
-        # get first comment block without leading comment tokens
-        block, _ = get_comment_block(self.content, index)
-        copyrights, remaining_block = search_copyright_information(block)
-
-        if len(copyrights) == 0:
-            block = get_multiline_comment_block(self.content, index)
+        def parse_comment_block(block):
             copyrights, remaining_block = search_copyright_information(block)
-
-        if len(copyrights) == 0:
-            return
-
-        self.copyrights = copyrights
+            self.copyrights += copyrights
+            # If no license has been identified yet, try to identify one in this block.
+            # For files with multiple licenses, only the first one found is considered,
+            # e.g. when a file with an existing license is copied and a new one is prepended.
+            if self.license_identifier == UNKNOWN_IDENTIFIER:
+                license_text = '{copyright}' + remaining_block
+                self.identify_license(license_text, 'file_headers', allowed_licenses)
+
+        # parse all single-line comment blocks for copyright information
+        tmp_index = index
+        while True:
+            block, tmp_index = get_comment_block(self.content, tmp_index)
+            if block:
+                parse_comment_block(block)
+            else:
+                break
+
+        # parse all multi-line comment blocks for copyright information
+        tmp_index = index
+        while True:
+            block, tmp_index = get_multiline_comment_block(self.content, tmp_index)
+            if block:
+                parse_comment_block(block)
+            else:
+                break
 
         self.identify_copyright()
 
-        content = '{copyright}' + remaining_block
-        self.identify_license(content, 'file_headers')
-
 
 class ContributingDescriptor(FileDescriptor):
 
     def __init__(self, path):
         super(ContributingDescriptor, self).__init__(CONTRIBUTING_FILETYPE, path)
 
-    def parse(self):
+    def parse(self, allowed_licenses):
         self.read()
         if not self.content:
             return
 
-        self.identify_license(self.content, 'contributing_files')
+        self.identify_license(self.content, 'contributing_files', allowed_licenses)
 
 
 class LicenseDescriptor(FileDescriptor):
@@ -150,15 +162,15 @@ class LicenseDescriptor(FileDescriptor):
     def __init__(self, path):
         super(LicenseDescriptor, self).__init__(LICENSE_FILETYPE, path)
 
-    def parse(self):
+    def parse(self, allowed_licenses):
         self.read()
         if not self.content:
             return
 
-        self.identify_license(self.content, 'license_files')
+        self.identify_license(self.content, 'license_files', allowed_licenses)
 
 
-def parse_file(path):
+def parse_file(path, allowed_licenses):
     filetype = determine_filetype(path)
     if filetype == SOURCE_FILETYPE:
         d = SourceDescriptor(path)
@@ -168,7 +180,7 @@ def parse_file(path):
         d = LicenseDescriptor(path)
     else:
         return None
-    d.parse()
+    d.parse(allowed_licenses)
     return d
 
 
@@ -180,9 +192,7 @@ def determine_filetype(path):
     return SOURCE_FILETYPE
 
 
-def search_copyright_information(content):
-    if content is None:
-        return [], content
+def get_copyright_information_regex():
     # regex for matching years or year ranges (yyyy-yyyy) separated by colons
     year = r'\d{4}'
     year_range = '%s-%s' % (year, year)
@@ -191,6 +201,13 @@ def search_copyright_information(content):
               r'copyright(?:\s+\(c\))?\s+(%s(?:,\s*%s)*),?\s+([^\n\r]+)$' % \
         (year_or_year_range, year_or_year_range)
     regex = re.compile(pattern, re.DOTALL | re.MULTILINE | re.IGNORECASE)
+    return regex
+
+
+def search_copyright_information(content):
+    if content is None:
+        return [], content
+    regex = get_copyright_information_regex()
 
     copyrights = []
     while True:
@@ -297,6 +314,7 @@ def get_multiline_comment_block(content, index):
         start_match = start_regex.search(content, index)
         if not start_match:
             continue
+        comment_token = start_match.group(1)
         start_index = start_match.start(1)
 
         # find the first match of the comment end token
@@ -323,8 +341,8 @@ def get_multiline_comment_block(content, index):
             # Single-line header does not have a common prefix to strip out
             lines = prefixed_lines
 
-        return '\n'.join(lines)
-    return None
+        return '\n'.join(lines), start_index + len(comment_token) + 1
+    return None, index
 
 
 def scan_past_empty_lines(content, index):
diff --git a/ament_copyright/test/test_parser.py b/ament_copyright/test/test_parser.py
index f36453e92..d42230da3 100644
--- a/ament_copyright/test/test_parser.py
+++ b/ament_copyright/test/test_parser.py
@@ -271,7 +271,7 @@ def test_get_comment_block_slashes2():
     """
     index = 0
     index = scan_past_empty_lines(commented_content, index)
-    block = get_multiline_comment_block(commented_content, index)
+    block, _ = get_multiline_comment_block(commented_content, index)
     assert block is not None
     assert block == 'ddd'
 
@@ -320,7 +320,7 @@ def test_get_multiline_comment_block_cstyle():
     """
     index = 0
     index = scan_past_empty_lines(commented_content, index)
-    block = get_multiline_comment_block(commented_content, index)
+    block, _ = get_multiline_comment_block(commented_content, index)
     assert block is not None
     assert block == '\n'.join(['aaa', 'bbb', 'ccc'])
 
@@ -339,7 +339,7 @@ def test_get_multiline_comment_block_cstyle2():
     """
     index = 0
     index = scan_past_empty_lines(commented_content, index)
-    block = get_multiline_comment_block(commented_content, index)
+    block, _ = get_multiline_comment_block(commented_content, index)
     assert block is not None
     assert block == '\n'.join(['aaa', 'bbb', 'ccc'])
 
@@ -355,7 +355,7 @@ def test_get_multiline_comment_block_xmlstyle():
     """
     index = 0
     index = scan_past_empty_lines(commented_content, index)
-    block = get_multiline_comment_block(commented_content, index)
+    block, _ = get_multiline_comment_block(commented_content, index)
     assert block is not None
     assert block == '\n'.join(['aaa', 'bbb', 'ccc'])
 
@@ -371,6 +371,6 @@ def test_get_multiline_comment_block_xmlstyle_prefixed():
     """
     index = 0
     index = scan_past_empty_lines(commented_content, index)
-    block = get_multiline_comment_block(commented_content, index)
+    block, _ = get_multiline_comment_block(commented_content, index)
     assert block is not None
     assert block == '\n'.join(['aaa', 'bbb', 'ccc'])

From b85188ebe68461391a12ffdcb058fbb77de3c920 Mon Sep 17 00:00:00 2001
From: Alberto Soragna <alberto.soragna@gmail.com>
Date: Tue, 18 Oct 2022 07:40:22 +0900
Subject: [PATCH 2/2] remove unneeded operation

Signed-off-by: Alberto Soragna <alberto.soragna@gmail.com>
---
 ament_copyright/ament_copyright/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ament_copyright/ament_copyright/main.py b/ament_copyright/ament_copyright/main.py
index 833ba11e7..fa8dc9846 100644
--- a/ament_copyright/ament_copyright/main.py
+++ b/ament_copyright/ament_copyright/main.py
@@ -311,7 +311,7 @@ def add_copyright_year(file_descriptors, new_years, verbose):
         if not match:
             assert False, "Could not find copyright information in file '%s'" % \
                 file_descriptor.path
-        years_span, _ = match.span(1), match.span(2)
+        years_span = match.span(1)
 
         # skip if all new years are already included
         years = get_years_from_string(block[years_span[0]:years_span[1]])