From 8cdd0a9821a46c463728a6565ad1c2688fdb209f Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Tue, 3 Mar 2026 14:05:45 +0100
Subject: [PATCH 01/14] alternate parser

---
 .gitignore              |    8 +
 src/bin/main.rs         |   20 +-
 src/phases/alt_parse.rs | 1330 +++++++++++++++++++++++++++++++++++++++
 src/phases/mod.rs       |    2 +
 4 files changed, 1359 insertions(+), 1 deletion(-)
 create mode 100644 src/phases/alt_parse.rs
diff --git a/.gitignore b/.gitignore
index 7959c06..2733b64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,14 @@ data/
 ### Scyros ###
 ghtokens.csv
 !tests/data
+outputs/
 
 *.zip
+# Negation patterns in .gitignore
+# A leading '!' negates a pattern: paths matching it are tracked by Git even
+# if an earlier rule would have ignored them.
+# Example: the 'data/' rule above ignores every directory named 'data', and
+# '!tests/data' re-includes 'tests/data' so that it stays tracked.
+# Comment lines must start with '#'; a line starting with '//' is read as a
+# pattern, not as a comment.
 *.tar.gz
\ No newline at end of file
diff --git a/src/bin/main.rs b/src/bin/main.rs
index d736477..ea61c47 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -14,7 +14,7 @@
 
 use clap::{Arg, ArgAction, Command};
 use scyros::phases::{
-    download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages,
+    alt_parse, download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages,
     filter_metadata, forks, ids, languages, metadata, parse, pull_request,
 };
 use scyros::utils::error::*;
@@ -36,6 +36,7 @@ fn cli() -> Command {
         .subcommand(duplicate_files::cli())
         .subcommand(parse::cli())
         .subcommand(extract_benchmarks::cli())
+        .subcommand(alt_parse::cli())
         .arg(
             Arg::new("debug")
                 .long("debug")
@@ -213,6 +214,23 @@ fn main() {
                                     &mut logger,
                                 )
                             }
+                            else if subcommand == alt_parse::cli().get_name() {
+                                alt_parse::run(
+                                    cli_subargs.get_one::<String>("input").unwrap(),
+                                    cli_subargs.get_one::<String>("output").map(|x| x.as_str()),
+                                    cli_subargs.get_one::<String>("logs").map(|x| x.as_str()),
+                                    cli_subargs
+                                        .get_many::<String>("lang")
+                                        .map(|v|
+                                        v.map(|s| s.as_str())
+                                        .collect::<Vec<&str>>()),
+                                    cli_subargs.get_one::<String>("failures").unwrap(),
+                                    *cli_subargs.get_one::<usize>("threads").unwrap(),
+                                    *cli_subargs.get_one::<u64>("seed").unwrap(),
+                                    cli_subargs.get_flag("force"),
+                                    &mut logger,
+                                )
+                            }
                             else {
                                 Error::new(&format!("The subcommand {} is not available. Run the program with the --help flag to see the list of subcommands", subcommand)).to_res()
                             }
diff --git a/src/phases/alt_parse.rs b/src/phases/alt_parse.rs
new file mode 100644
index 0000000..ed5f6dc
--- /dev/null
+++ b/src/phases/alt_parse.rs
@@ -0,0 +1,1330 @@
+// Copyright 2025 Andrea Gilot
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Parse all the files listed in the input file and extract every function they define, without any keyword filtering.
+//! Each extracted function is logged in a CSV file together with basic statistics:
+//! the number of lines of code, the number of words, the number of parameters and the position of the first
+//! parse error, if any. A second CSV file logs, for every parsed file, the number of functions extracted from it.
+//! By default, these files are named after the input file with the extensions `.functions_simple.csv` and `.function_logs_simple.csv`.
+//! The source code of the functions of a file is stored in a folder named after that file with the extension `.functions_simple`.
+//! The supported languages are C, C++, C#, Fortran, Go, Java, Python, Scala and TypeScript.
+
+use clap::ArgAction;
+use clap::{Arg, Command};
+use indicatif::ProgressBar;
+use polars::prelude::*;
+use rand::rngs::StdRng;
+use rand::seq::SliceRandom as _;
+use rand::SeedableRng;
+
+use std::iter::FromIterator as _;
+use std::vec;
+use std::{collections::HashSet, fmt::Write, io::Write as IOWrite, sync::Mutex};
+use tree_sitter::{Language, Node, Parser, Tree};
+
+use crate::utils::error::*;
+use crate::utils::fs::*;
+use crate::utils::regex::*;
+use crate::utils::{csv::*, logger::Logger};
+
+/// Builds the command line interface of the `alt_parse` subcommand.
+pub fn cli() -> Command {
+    Command::new("alt_parse")
+        .about("Parse all the files in the dataset and extract functions")
+        .long_about(
+            "Parse all the files in the input file and extract all their functions. \
+            All parsed files are logged in a CSV file and statistics about the functions are stored in another one. \
+            These statistics include the number of lines of code, the number of words, the number of parameters \
+            and the position of the first parse error.\n\
+            By default, the statistics are stored in the input file name with the extension \".functions_simple.csv\" and the logs with the extension \".function_logs_simple.csv\". \
+            The functions of a file are stored in a folder with the same name as that file and the extension \".functions_simple\".\n\
+            The supported languages are c, c++, c#, fortran, go, java, python, scala and typescript."
+        )
+        .disable_version_flag(true)
+        .arg(
+            Arg::new("input")
+                .short('i')
+                .long("input")
+                .value_name("INPUT_FILE.csv")
+                .help("Path to the input csv file to use. It must be a valid CSV file with the columns `id`, `name` (the path to the file) and \
+                       `language` (the language of the file). Other columns are ignored.")
+                .required(true)
+        )
+        .arg(
+            Arg::new("output")
+                .short('o')
+                .long("output")
+                .value_name("OUTPUT_FILE.csv")
+                .help("Path to the output csv file storing the functions statistics.")
+                .required(false),
+        )
+        .arg(
+            Arg::new("logs")
+                .short('l')
+                .long("logs")
+                .value_name("LOGS_FILE.csv")
+                .help("Path to the output csv file storing the files statistics. The default is the input file name with the extension \".function_logs_simple.csv\".")
+                .required(false),
+        )
+/*         .arg(
+            Arg::new("keywords")
+                .short('k')
+                .long("keywords")
+                .num_args(1..)
+                .action(ArgAction::Append)
+                .value_name("KEYWORDS_FILES.json")
+                .help("List of files containing the list of extensions and keywords to use. The files must be in JSON format.\n\
+                    The extensions should be written without the period (`java` instead of `.java`). The files must have the following structure:\n    \
+                        {\n        \
+                            \"extensions\": {\n            \
+                                \"ext1\": [\"kw11\", \"kw12\", ...],\n            \
+                                \"ext2\": [\"kw21\", \"kw22\", ...],\n            \
+                                ...\n        \
+                            },\n        \
+                            \"keywords\": [\"kw1\", \"kw2\", ...]\n    \
+                        }")
+                .required(true)
+        ) */
+        .arg(
+            Arg::new("lang")
+                .long("lang")
+                .num_args(1..)
+                .action(ArgAction::Append)
+                .value_name("LANGUAGES")
+                .help("List of languages to parse. The supported values are c, c++, c#, fortran, go, java, python, scala and typescript (lowercase).")
+                .required(false)
+        )
+        .arg(
+            Arg::new("force")
+                .short('f')
+                .long("force")
+                .help("Override the output file if it already exists.")
+                .default_value("false")
+                .action(ArgAction::SetTrue),
+        )
+        .arg(
+            Arg::new("threads")
+                .short('n')
+                .help("Number of threads to use.")
+                .default_value("1")
+                .value_parser(clap::value_parser!(usize))
+        )
+        .arg(
+            Arg::new("seed")
+                .short('s')
+                .long("seed")
+                .value_name("SEED")
+                .help("Seed used to randomly shuffle the input file.")
+                .default_value("8155495201244430235")
+                .value_parser(clap::value_parser!(u64)),
+        )
+        .arg(
+            Arg::new("failures")
+            .long("failures")
+            .value_name("POLICY")
+            .help("Failure policy when a file or a function has a parsing error.\n\
+            ignore: continue parsing and record the position of the error in the output files\n\
+            skip-file: do not extract any function from the file and do not log it\n\
+            skip-function: do not extract the function\n\
+            abort: stop the program")
+            .default_value("ignore")
+            .value_parser(["ignore", "skip-file", "skip-function", "abort"]),
+        )
+}
+
+/// Extracts every function of the files listed in the input CSV and logs basic statistics about them.
+///
+/// # Arguments
+///
+/// * `input_path` - Path to the input csv file; its `id`, `name` and `language` columns are read.
+/// * `output_path` - Path to the csv file storing the functions statistics (default: `<input_path>.functions_simple.csv`).
+/// * `logs_path` - Path to the csv file storing the files statistics (default: `<input_path>.function_logs_simple.csv`).
+/// * `opt_languages` - Languages to parse; `None` selects every supported language.
+/// * `fail_policy` - What to do on parse errors: `ignore`, `skip-file`, `skip-function` or `abort`.
+/// * `threads` - Number of worker threads; `0` is treated as `1`.
+/// * `seed` - Seed used to shuffle the order in which the files are processed.
+/// * `force` - Overwrite the output and logs files if they already exist.
+/// * `logger` - Logger receiving the progress messages.
+pub fn run(
+    input_path: &str,
+    output_path: Option<&str>,
+    logs_path: Option<&str>,
+    opt_languages: Option<Vec<&str>>,
+    fail_policy: &str,
+    threads: usize,
+    seed: u64,
+    force: bool,
+    logger: &mut Logger,
+) -> Result<(), Error> {
+    let supported_languages: HashSet<&'static str> = vec![
+        "c",
+        "c++",
+        "c#",
+        "java",
+        "python",
+        "fortran",
+        "typescript",
+        "go",
+        "scala",
+    ]
+    .into_iter()
+    .collect::<HashSet<_>>();
+
+    let languages: Vec<&str> = match opt_languages {
+        Some(l) => {
+            for lang in l.iter() {
+                if !supported_languages.contains(lang) {
+                    Error::new(&format!("Unsupported language: {}", lang)).to_res()?;
+                }
+            }
+            l
+        }
+        None => {
+            logger.log("No language specified, using all supported languages")?;
+            supported_languages.into_iter().collect()
+        }
+    };
+
+    let languages_series = Series::new(
+        "language_filter".into(),
+        languages
+            .iter()
+            .map(|x| x.to_string())
+            .collect::<Vec<String>>(),
+    );
+
+    let default_output_path: String = format!("{}.functions_simple.csv", input_path);
+    let output_path: &str = output_path.unwrap_or(&default_output_path);
+
+    match check_path(output_path) {
+        Ok(_) => {
+            if force {
+                logger.log(&format!("Overriding existing file: {}", output_path))?;
+            } else {
+                Error::new(&format!(
+                    "File {} already exists. Use --force to override it.",
+                    output_path
+                ))
+                .to_res()?;
+            }
+        }
+        Err(_) => logger.log(&format!("Creating new file: {}", output_path))?,
+    }
+
+    let default_logs_path: String = format!("{}.function_logs_simple.csv", input_path);
+    let logs_path: &str = logs_path.unwrap_or(&default_logs_path);
+
+    match check_path(logs_path) {
+        Ok(_) => {
+            if force {
+                logger.log(&format!("Overriding existing file: {}", logs_path))?;
+            } else {
+                Error::new(&format!(
+                    "File {} already exists. Use --force to override it.",
+                    logs_path
+                ))
+                .to_res()?;
+            }
+        }
+        Err(_) => logger.log(&format!("Creating new file: {}", logs_path))?,
+    }
+
+    let mut input_file = open_csv(
+        input_path,
+        Some(Schema::from_iter(vec![
+            Field::new("id".into(), DataType::UInt32),
+            Field::new("name".into(), DataType::String),
+            Field::new("language".into(), DataType::String),
+        ])),
+        Some(vec!["id", "name", "language"]),
+    )?;
+
+    let n_files_before = input_file.height();
+
+    logger.log(&format!(
+        "  {} files found in the input file, filtering by selected languages",
+        n_files_before
+    ))?;
+
+    input_file = map_err(
+        input_file
+            .lazy()
+            .filter(col("language").is_in(lit(languages_series)))
+            .collect(),
+        "Error filtering languages",
+    )?;
+
+    let n_files = input_file.height();
+
+    logger.log(&format!(
+        "  {} files found after filtering ({:.2} %)",
+        n_files,
+        if n_files_before == 0 {
+            0.0
+        } else {
+            n_files as f64 / n_files_before as f64 * 100.0
+        }
+    ))?;
+
+    logger.log_seed(seed)?;
+
+    let mut shuffled_idx = (0..input_file.height()).collect::<Vec<usize>>();
+
+    logger.log_completion("Loading files in random order", || {
+        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
+        shuffled_idx.shuffle(&mut rng);
+        Ok(())
+    })?;
+
+    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
+        let row = input_file.get_row(idx).unwrap().0;
+        match (row[0].clone(), row[1].clone(), row[2].clone()) {
+            (AnyValue::UInt32(id), AnyValue::String(path), AnyValue::String(lang)) => Ok((
+                id,
+                path.replace("-was_comma-", ",")
+                    .replace("-was_quote-", "\""),
+                lang,
+            )),
+            _ => Err(idx),
+        }
+    });
+
+    const OUTPUT_COLS: usize = 9;
+    const LOGS_COLS: usize = 5;
+
+    let word_counter: Matcher = Matcher::words_matcher();
+
+    let mut output_file = CSVFile::new(output_path, FileMode::Overwrite)?;
+
+    let header: [&str; OUTPUT_COLS] = [
+        "id",
+        "path",
+        "name",
+        "position",
+        "language",
+        "loc",
+        "words",
+        "params",
+        "parse_error",
+    ];
+
+    output_file.write_header(&header)?;
+
+    let mut logs_file = CSVFile::new(logs_path, FileMode::Overwrite)?;
+
+    let logs_header: [&str; LOGS_COLS] = ["id", "name", "language", "functions", "parse_error"];
+
+    logs_file.write_header(&logs_header)?;
+    let threads: usize = threads.max(1); // with zero workers the receive loop below would wait forever
+    let iter = Mutex::new(shuffled_rows.into_iter());
+
+    let (tx, rx) =
+        crossbeam_channel::unbounded::<Option<Result<(String, Option<String>), Error>>>();
+
+    map_err_debug(
+        crossbeam::thread::scope(|s| {
+            for _ in 0..threads {
+                s.spawn(|_| {
+                    let my_tx = tx.clone();
+                    loop {
+                        let next_item: Option<Result<(u32, String, &str), usize>> = {
+                            let mut iter_guard = iter.lock().unwrap();
+                            iter_guard.next()
+                        };
+
+                        match next_item {
+                            Some(row) => match row {
+                                Ok((project_id, file_name, language)) => match analyze_file_simple(
+                                    project_id,
+                                    &file_name,
+                                    language,
+                                    fail_policy,
+                                    &word_counter,
+                                ) {
+                                    Ok(s) => {
+                                        my_tx.send(Some(Ok(s))).unwrap();
+                                    }
+                                    Err(e) => {
+                                        my_tx.send(Some(e.to_res())).unwrap();
+                                        break;
+                                    }
+                                },
+                                Err(row_nr) => {
+                                    let _ = my_tx.send(Some(
+                                        Error::new(&format!("Could not parse row {}", row_nr))
+                                            .to_res(),
+                                    ));
+                                }
+                            },
+                            None => {
+                                my_tx.send(None).unwrap();
+                                break;
+                            }
+                        }
+                    }
+                });
+            }
+
+            let mut ended_threads = 0;
+
+            let progress = ProgressBar::new(n_files as u64);
+            progress.set_style(map_err(
+                indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%"),
+                "Invalid progress bar style",
+            )?);
+
+            while let Ok(msg) = rx.recv() {
+                match msg {
+                    Some(Ok((output, opt_log))) => {
+                        map_err(
+                            write!(&mut output_file, "{}", output),
+                            &format!("Error writing {}", output_path),
+                        )?;
+                        if let Some(log) = opt_log {
+                            map_err(
+                                writeln!(&mut logs_file, "{}", log),
+                                &format!("Error writing {}", logs_path),
+                            )?;
+                        }
+                        progress.inc(1);
+                    }
+                    Some(Err(e)) => {
+                        e.chain("Error in child thread").to_res::<()>()?;
+                    }
+                    None => {
+                        ended_threads += 1;
+                        if ended_threads == threads {
+                            break;
+                        }
+                    }
+                }
+            }
+            progress.finish();
+            Ok(())
+        }),
+        "Error in one of the threads",
+    )?
+}
+
+/// Parses one file, extracts all its functions and returns the text to append to the output file and, optionally, the row for the logs file.
+fn analyze_file_simple(
+    project_id: u32,
+    path: &str,
+    language: &str,
+    fail_policy: &str,
+    word_counter: &Matcher,
+) -> Result<(String, Option<String>), Error> {
+    match language_to_grammar(language) {
+        Some(grammar) => {
+            let mut parser: Parser = Parser::new();
+            map_err(parser.set_language(&grammar.lang), "Cannot load grammar")?;
+            match load_file(path, 1024 * 1024 * 1024)? { // second argument is 1 GiB, presumably a size limit (TODO confirm)
+                Ok(source_code) => {
+                    let target_folder: String = format!("{}.functions_simple", path); // extracted functions go here
+                    create_dir(&target_folder)?;
+
+                    let tree: Tree = ok_or_else(
+                        parser.parse(&source_code, None),
+                        &format!("Error parsing file {}", path),
+                    )?;
+
+                    let file_has_parse_error: bool = tree.root_node().has_error();
+
+                    if file_has_parse_error && fail_policy == "skip-file" {
+                        Ok((String::new(), None)) // no output text and no log row for this file
+                    } else if file_has_parse_error && fail_policy == "abort" {
+                        Error::new(&format!("Parse error in file {}", path)).to_res()
+                    } else {
+                        let root: Node<'_> = tree.root_node();
+                        let (output, total_functions) = extract_functions_simple(
+                            project_id,
+                            &root,
+                            &target_folder,
+                            language,
+                            &grammar,
+                            &source_code,
+                            fail_policy,
+                            word_counter,
+                        )?;
+
+                        let error_position: String = if file_has_parse_error {
+                            position_to_string(find_first_error_position(&root))
+                        } else {
+                            "none".to_string()
+                        };
+
+                        Ok((
+                            output,
+                            Some(format!(
+                                "{},{},{},{},{}", // columns: id, name, language, functions, parse_error
+                                project_id,
+                                path.replace(",", "-was_comma-") // keep the CSV row well-formed
+                                    .replace("\"", "-was_quote-"),
+                                language,
+                                total_functions,
+                                error_position,
+                            )),
+                        ))
+                    }
+                }
+
+                Err(_) => Ok(( // the file could not be loaded: log it with -1 functions and no parse error
+                    String::new(),
+                    Some(format!(
+                        "{},{},{},-1,{}",
+                        project_id,
+                        path.replace(",", "-was_comma-")
+                            .replace("\"", "-was_quote-"),
+                        language,
+                        "none",
+                    )),
+                )),
+            }
+        }
+        None => Error::new(&format!("Unsupported language: {}", language)).to_res(),
+    }
+}
+
+/// Walks the parse tree, writes every function found to `target_folder` and returns their CSV rows together with their count.
+fn extract_functions_simple(
+    project_id: u32,
+    root: &Node,
+    target_folder: &str,
+    language: &str,
+    grammar: &Grammar,
+    source: &[u8],
+    fail_policy: &str,
+    word_counter: &Matcher,
+) -> Result<(String, usize), Error> {
+    let mut builder: String = String::new();
+    let mut functions: usize = 0;
+
+    let mut call_stack: Vec<Node> = Vec::new(); // explicit stack: the tree is traversed without recursion
+    call_stack.push(*root);
+    let mut cursor = root.walk();
+
+    while let Some(node) = call_stack.pop() {
+        if grammar.function_nodes.contains(node.kind()) { // a function node is never descended into
+            let has_error: bool = node.has_error();
+
+            if (has_error && fail_policy == "skip-function")
+                || (language == "java" && find_fields(&node, "body").is_empty()) // presumably body-less (abstract/interface) methods
+            {
+                continue;
+            } else {
+                let function_source_code: &[u8] = node_source_code(&node, source);
+                let function_position: (usize, usize) = ( // 1-based (row, column) of the function start
+                    node.start_position().row + 1,
+                    node.start_position().column + 1,
+                );
+
+                let error_position: String = if has_error {
+                    position_to_string(find_first_error_position(&node).map(|(row, col)| {
+                        let error_row = row - function_position.0 + 1; // assumes 1-based positions from the helper (TODO confirm)
+                        if row == function_position.0 {
+                            (error_row, col - function_position.1 + 1)
+                        } else {
+                            (error_row, col)
+                        }
+                    }))
+                } else {
+                    "none".to_string()
+                };
+
+                let function_code_with_strings: &Vec<u8> =
+                    &remove_kind_from_source(function_source_code, &node, &grammar.comment_nodes);
+                /* Disabled: second parse that would also strip the string literals from the code.
+                let tree_without_comments: Tree = ok_or_else(
+                    parser.parse(function_code_with_strings, None),
+                    &format!(
+                        "Error parsing code for function {}/{}",
+                        target_folder, functions
+                    ),
+                )?;
+
+                let function_code = &remove_kind_from_source(
+                    function_code_with_strings,
+                    &tree_without_comments.root_node(),
+                    &grammar.string_literal_nodes,
+                ); */
+
+                let function_path: String = format!(
+                    "{}/{}-{}", // the file is named after the position of the function
+                    target_folder, function_position.0, function_position.1
+                );
+
+                map_err(
+                    std::fs::write(&function_path, function_source_code), // original code, comments included
+                    &format!("Cannot write function code to {}", function_path),
+                )?;
+
+                let params_vec: Vec<Node<'_>> =
+                    find_first_node_of_kind(&node, &grammar.param_seq_nodes, true);
+
+                let mut name: String = String::from_utf8_lossy(
+                    find_first_field(&node, grammar.name_field)
+                        .map(|n| node_source_code(&n, source))
+                        .unwrap_or(b""), // empty name when the field is missing
+                )
+                .to_string();
+                if let Some(idx) = name.find('(') { // the field may hold a whole declarator: keep what precedes `(`
+                    name.truncate(idx);
+                }
+                name = name.chars().filter(|c| !c.is_whitespace()).collect();
+
+                let mut n_param: usize = 0;
+                for params in params_vec {
+                    n_param += count_nodes_of_kind(&params, &grammar.param_nodes).0;
+                }
+
+                map_err(
+                    writeln!(
+                        &mut builder,
+                        "{},{},{},{},{},{},{},{},{}", // id, path, name, position, language, loc, words, params, parse_error
+                        project_id,
+                        &function_path
+                            .replace(",", "-was_comma-")
+                            .replace("\"", "-was_quote-"),
+                        name.replace(",", "-was_comma-")
+                            .replace("\"", "-was_quote-"),
+                        position_to_string(Some(function_position)),
+                        language,
+                        count_text_lines(function_code_with_strings), // measured on the code without comments
+                        word_counter.count_matches_in_text(function_code_with_strings),
+                        n_param,
+                        error_position,
+                    ),
+                    &format!("Error writing function statistics of {}", function_path),
+                )?;
+
+                functions += 1;
+            }
+        } else {
+            for c in node
+                .children(&mut cursor)
+                .collect::<Vec<_>>()
+                .into_iter()
+                .rev() // pushed in reverse so that children are popped in source order
+            {
+                call_stack.push(c);
+            }
+        }
+    }
+    Ok((builder, functions))
+}
+
+/// Slices the text of a parse tree node out of the file it was parsed from.
+///
+/// # Arguments
+/// * `n` - The node whose source code is wanted.
+/// * `source` - The bytes of the whole file `n` belongs to.
+fn node_source_code<'a>(n: &Node, source: &'a [u8]) -> &'a [u8] {
+    let span = n.start_byte()..n.end_byte();
+    &source[span]
+}
+
+/// Tree-sitter description of a programming language: its parser and the node kinds of interest.
+#[allow(dead_code)]
+struct Grammar {
+    /// The tree-sitter language handed to the parser.
+    lang: Language,
+
+    /// Kinds of the nodes representing comments.
+    comment_nodes: HashSet<&'static str>,
+
+    /// Kinds of the nodes representing string literals.
+    string_literal_nodes: HashSet<&'static str>,
+
+    /// Kinds of the nodes representing loops.
+    loop_nodes: HashSet<&'static str>,
+
+    /// Kinds of the nodes representing conditional statements.
+    cond_nodes: HashSet<&'static str>,
+
+    /// Kinds of the nodes representing functions or methods.
+    function_nodes: HashSet<&'static str>,
+
+    /// Kinds of the nodes representing function or method calls.
+    function_call_nodes: HashSet<&'static str>,
+
+    /// Kinds of the nodes representing a sequence of parameters of a function or method.
+    param_seq_nodes: HashSet<&'static str>,
+
+    /// Kinds of the nodes representing a single parameter of a function or method.
+    param_nodes: HashSet<&'static str>,
+
+    /// The field name of the parameter type, if any.
+    param_type_field: Option<&'static str>,
+
+    /// The field name under which a function node stores the function or method name.
+    name_field: &'static str,
+}
+
+/// Builds the [`Grammar`] used to read C sources.
+///
+/// Each set holds the tree-sitter node kinds that play the corresponding role in C.
+/// The function name is read from the `declarator` field of the function node.
+fn c_grammar() -> Grammar {
+    let lang: Language = tree_sitter_c::LANGUAGE.into();
+    Grammar {
+        lang,
+        comment_nodes: HashSet::from(["comment"]),
+        string_literal_nodes: HashSet::from(["string_literal"]),
+        loop_nodes: HashSet::from(["for_statement", "while_statement", "do_statement"]),
+        cond_nodes: HashSet::from(["if_statement", "switch_statement", "conditional_expression"]),
+        function_nodes: HashSet::from(["function_definition"]),
+        function_call_nodes: HashSet::from(["call_expression"]),
+        param_seq_nodes: HashSet::from(["parameter_list"]),
+        param_nodes: HashSet::from(["parameter_declaration"]),
+        param_type_field: Some("type"),
+        name_field: "declarator",
+    }
+}
+
+/// Returns the grammar for the C++ programming language.
+///
+/// `do_statement` is part of the loop nodes, as in the C grammar, so that do-while loops are
+/// recognised in C++ sources too.
+fn cpp_grammar() -> Grammar {
+    Grammar {
+        lang: tree_sitter_cpp::LANGUAGE.into(),
+        comment_nodes: HashSet::from(["comment"]),
+        string_literal_nodes: HashSet::from(["string_literal"]),
+        loop_nodes: HashSet::from([
+            "for_range_loop",
+            "for_statement",
+            "while_statement",
+            "do_statement",
+        ]),
+        cond_nodes: HashSet::from(["if_statement", "switch_statement", "conditional_expression"]),
+        function_nodes: HashSet::from(["function_definition", "template_declaration"]),
+        function_call_nodes: HashSet::from(["call_expression"]),
+        param_seq_nodes: HashSet::from(["parameter_list"]),
+        param_nodes: HashSet::from(["parameter_declaration", "variadic_parameter_declaration"]),
+        param_type_field: Some("type"),
+        name_field: "declarator",
+    }
+}
+
+/// Builds the [`Grammar`] used to read C# sources.
+///
+/// Each set holds the tree-sitter node kinds that play the corresponding role in C#;
+/// methods, constructors and operator overloads are the kinds treated as functions.
+fn cs_grammar() -> Grammar {
+    // The three kinds of string literal node are listed separately.
+    let string_literal_nodes: HashSet<&'static str> = HashSet::from([
+        "string_literal",
+        "verbatim_string_literal",
+        "raw_string_literal",
+    ]);
+    // Node kinds treated as functions or methods.
+    let function_nodes: HashSet<&'static str> = HashSet::from([
+        "method_declaration",
+        "constructor_declaration",
+        "operator_declaration",
+    ]);
+
+    Grammar {
+        lang: tree_sitter_c_sharp::LANGUAGE.into(),
+        comment_nodes: HashSet::from(["comment"]),
+        string_literal_nodes,
+        loop_nodes: HashSet::from(["for_statement", "while_statement", "do_statement"]),
+        cond_nodes: HashSet::from(["if_statement", "switch_statement", "conditional_expression"]),
+        function_nodes,
+        function_call_nodes: HashSet::from(["invocation_expression"]),
+        param_seq_nodes: HashSet::from(["parameter_list"]),
+        param_nodes: HashSet::from(["parameter"]),
+        param_type_field: Some("type"),
+        name_field: "name",
+    }
+}
+
+/// Returns the grammar for the TypeScript programming language.
+fn ts_grammar() -> Grammar {
+    Grammar {
+        lang: tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
+        comment_nodes: vec!["comment"].into_iter().collect(),
+        string_literal_nodes: vec!["string_fragment"].into_iter().collect(),
+        loop_nodes: vec!["for_statement", "for_in_statement", "while_statement"]
+            .into_iter()
+            .collect(),
+        cond_nodes: vec!["if_statement", "switch_statement", "ternary_expression"]
+            .into_iter()
+            .collect(),
+        function_nodes: vec!["function_declaration", "method_definition"]
+            .into_iter()
+            .collect(),
+        function_call_nodes: vec![
+            "new_expression",
+            "call_expression",
+            "decorator_call_expression",
+        ]
+        .into_iter()
+        .collect(),
+        param_seq_nodes: vec!["formal_parameters"].into_iter().collect(),
+        param_nodes: vec!["required_parameter", "optional_parameter"]
+            .into_iter()
+            .collect(),
+        param_type_field: Some("type"),
+        name_field: "name",
+    }
+}
+
+/// Returns the grammar for the Go programming language.
+fn go_grammar() -> Grammar {
+    Grammar {
+        lang: tree_sitter_go::LANGUAGE.into(),
+        comment_nodes: vec!["comment"].into_iter().collect(),
+        string_literal_nodes: vec!["raw_string_literal", "interpreted_string_literal"]
+            .into_iter()
+            .collect(),
+        loop_nodes: vec!["for_statement"].into_iter().collect(),
+        cond_nodes: vec![
+            "if_statement",
+            "type_switch_statement",
+            "expression_switch_statement",
+        ]
+        .into_iter()
+        .collect(),
+        function_nodes: vec!["function_declaration", "method_declaration"]
+            .into_iter()
+            .collect(),
+        function_call_nodes: vec!["call_expression"].into_iter().collect(),
+        param_seq_nodes: vec!["parameter_list"].into_iter().collect(),
+        param_nodes: vec!["parameter_declaration", "variadic_parameter_declaration"]
+            .into_iter()
+            .collect(),
+        param_type_field: Some("type"),
+        name_field: "name",
+    }
+}
+
+/// Returns the grammar for the Java programming language.
+fn java_grammar() -> Grammar {
+    Grammar {
+        lang: tree_sitter_java::LANGUAGE.into(),
+        comment_nodes: vec!["line_comment", "block_comment"].into_iter().collect(),
+        string_literal_nodes: vec!["string_literal"].into_iter().collect(),
+        loop_nodes: vec![
+            "for_statement",
+            "enhanced_for_statement",
+            "while_statement",
+            "do_statement",
+        ]
+        .into_iter()
+        .collect(),
+        cond_nodes: vec!["if_statement", "ternary_expression", "switch_expression"]
+            .into_iter()
+            .collect(),
+        function_nodes: vec!["method_declaration", "compact_constructor_declaration"]
+            .into_iter()
+            .collect(),
+        function_call_nodes: vec!["method_invocation", "explicit_constructor_invocation"]
+            .into_iter()
+            .collect(),
+        param_seq_nodes: vec!["formal_parameters"].into_iter().collect(),
+        param_nodes: vec!["formal_parameter"].into_iter().collect(),
+        param_type_field: Some("type"),
+        name_field: "name",
+    }
+}
+
+/// Returns the grammar for the Scala programming language.
+fn scala_grammar() -> Grammar {
+    Grammar {
+        lang: tree_sitter_scala::LANGUAGE.into(),
+        comment_nodes: vec!["comment", "block_comment"].into_iter().collect(),
+        string_literal_nodes: vec!["string"].into_iter().collect(),
+        loop_nodes: vec!["for_expression", "while_expression", "do_while_expression"]
+            .into_iter()
+            .collect(),
+        cond_nodes: vec!["if_expression", "match_expression"]
+            .into_iter()
+            .collect(),
+        function_nodes: vec!["function_definition"].into_iter().collect(),
+        function_call_nodes: vec!["call_expression"].into_iter().collect(),
+        param_seq_nodes: vec!["parameters"].into_iter().collect(),
+        param_nodes: vec!["parameter"].into_iter().collect(),
+        param_type_field: Some("type"),
+        name_field: "name",
+    }
+}
+
+/// Returns the grammar for the Fortran programming language.
+fn fortran_grammar() -> Grammar {
+    Grammar {
+        lang: tree_sitter_fortran::LANGUAGE.into(),
+        comment_nodes: vec!["preproc_comment", "comment"].into_iter().collect(),
+        string_literal_nodes: vec!["string_literal"].into_iter().collect(),
+        loop_nodes: vec![
+            "loop_control_expression",
+            "where_statement",
+            "forall_statement",
+            "concurrent_statement",
+            "while_statement",
+        ]
+        .into_iter()
+        .collect(),
+        cond_nodes: vec![
+            "if_statement",
+            "arithmetic_if_statement",
+            "select_case_statement",
+            "select_rank_statement",
+            "select_type_statement",
+        ]
+        .into_iter()
+        .collect(),
+        function_nodes: vec!["function", "subroutine"].into_iter().collect(),
+        function_call_nodes: vec!["call_expression", "subroutine_call"]
+            .into_iter()
+            .collect(),
+        param_seq_nodes: vec!["parameters"].into_iter().collect(),
+        param_nodes: vec!["identifier"].into_iter().collect(),
+        param_type_field: None,
+        name_field: "name",
+    }
+}
+
+/// Returns the grammar for the Python programming language.
+fn python_grammar() -> Grammar {
+    Grammar {
+        lang: tree_sitter_python::LANGUAGE.into(),
+        comment_nodes: vec!["comment"].into_iter().collect(),
+        string_literal_nodes: vec!["string"].into_iter().collect(),
+        loop_nodes: vec!["for_statement", "while_statement"]
+            .into_iter()
+            .collect(),
+        cond_nodes: vec!["if_statement", "conditional_expression", "match_statement"]
+            .into_iter()
+            .collect(),
+        function_nodes: vec!["function_definition", "lambda"].into_iter().collect(),
+        function_call_nodes: vec!["call"].into_iter().collect(),
+        param_seq_nodes: vec!["parameters"].into_iter().collect(),
+        param_nodes: vec!["parameter"].into_iter().collect(),
+        param_type_field: None,
+        name_field: "name",
+    }
+}
+
+/// Returns the grammar corresponding to the given language.
+///
+/// # Arguments
+///
+/// * `lang` - The lowercase name of the language, e.g. `"c++"` or `"python"`.
+///
+/// # Returns
+///
+/// The grammar corresponding to the language or `None` if the language is not supported.
+fn language_to_grammar(lang: &str) -> Option<Grammar> {
+    match lang {
+        "c" => Some(c_grammar()),
+        "c++" => Some(cpp_grammar()),
+        "c#" => Some(cs_grammar()),
+        "java" => Some(java_grammar()),
+        "fortran" => Some(fortran_grammar()),
+        "python" => Some(python_grammar()),
+        "typescript" => Some(ts_grammar()),
+        "go" => Some(go_grammar()),
+        "scala" => Some(scala_grammar()),
+        _ => None,
+    }
+}
+
+/// Counts the number of nodes of given kinds in a tree.
+///
+/// # Arguments
+///
+/// * `root` - The root node of the tree.
+/// * `kinds` - The kinds of nodes to count.
+///
+/// # Returns
+///
+/// A tuple containing the number of nodes of the given kinds and the maximum nesting level of these nodes.
+///
+/// # Example
+///
+/// The function applied to a node representing the following code will return `(2, 2)` if the kind is `if_statement`:
+///
+/// ```c
+/// int main(int a, int b) {
+///     if (b > 0) {
+///         if (a > b) {
+///             return a;
+///         } else {
+///             return b;
+///         }
+///     }
+///     return 0;
+///  }
+/// ```
+///
+fn count_nodes_of_kind(root: &Node, kinds: &HashSet<&str>) -> (usize, usize) {
+    let mut node_count = 0;
+    let mut max_nesting = 0;
+
+    let mut cursor = root.walk();
+
+    // Explicit stack of (node, nesting level the node would have if it matches)
+    let mut call_stack: Vec<(Node, usize)> = Vec::new();
+    call_stack.push((*root, 1));
+
+    while let Some((node, depth)) = call_stack.pop() {
+        let is_of_kind = kinds.contains(node.kind());
+
+        if is_of_kind {
+            node_count += 1;
+            max_nesting = max_nesting.max(depth);
+        }
+
+        // Visit order affects neither the count nor the maximum, so children are not reversed
+        for child in node.children(&mut cursor) {
+            call_stack.push((child, if is_of_kind { depth + 1 } else { depth }));
+        }
+    }
+
+    (node_count, max_nesting)
+}
+
+/// Searches the subtree rooted at `node` for nodes satisfying `pred`.
+///
+/// Depth-first (`breadth == false`): returns only the first match in document order.
+/// Breadth-first (`breadth == true`): returns every match on the shallowest level that
+/// contains one, visiting each level from its last node to its first.
+/// Matching nodes are not descended into; the result is empty when nothing matches.
+fn find_first_node<'a>(
+    node: &Node<'a>,
+    pred: &dyn Fn(&Node) -> bool,
+    breadth: bool,
+) -> Vec<Node<'a>> {
+    let mut cursor = node.walk();
+    let mut call_stack: Vec<(Node, usize)> = vec![(*node, 0)];
+    let mut res: Vec<Node<'a>> = Vec::new();
+    let mut max_depth: Option<usize> = None;
+
+    while let Some((node, depth)) = call_stack.pop() {
+        if matches!(max_depth, Some(d) if depth > d) {
+            // Every node of the shallowest matching level has been examined.
+            break;
+        }
+        if pred(&node) {
+            if !breadth {
+                return vec![node];
+            }
+            res.push(node);
+            max_depth = max_depth.or(Some(depth));
+        } else if breadth {
+            // Prepend the children so that `pop` keeps consuming the current level first.
+            let mut end_queue: Vec<(Node, usize)> =
+                node.children(&mut cursor).map(|c| (c, depth + 1)).collect();
+            end_queue.extend(call_stack);
+            call_stack = end_queue;
+        } else {
+            // Push in reverse so that the leftmost child is popped first.
+            let children: Vec<_> = node.children(&mut cursor).collect();
+            call_stack.extend(children.into_iter().rev().map(|c| (c, 0)));
+        }
+    }
+    // Also reached when the queue empties: matches collected so far must not be dropped.
+    res
+}
+
+fn find_first_node_of_kind<'a>(
+    root: &Node<'a>,
+    kind: &HashSet<&str>,
+    breadth: bool,
+) -> Vec<Node<'a>> {
+    find_first_node(root, &|n: &Node| kind.contains(n.kind()), breadth)
+}
+
+/// Finds the first error node in the tree
+///
+/// # Arguments
+///
+/// * `root` - The root node of the tree.
+///
+/// # Returns
+///
+/// The first error node found in the tree, or `None` if no error node is found.
+fn find_first_error_node<'a>(root: &Node<'a>) -> Option<Node<'a>> {
+    find_first_node(root, &|n: &Node| n.is_error() || n.is_missing(), false)
+        .into_iter()
+        .next()
+}
+
+fn find_first_error_position(root: &Node) -> Option<(usize, usize)> {
+    find_first_error_node(root).map(|n| (n.start_position().row + 1, n.start_position().column + 1))
+}
+
+/// Formats a position as `row:col`, or as `not-found` when there is no position.
+fn position_to_string(position: Option<(usize, usize)>) -> String {
+    position
+        .map(|(row, col)| format!("{}:{}", row, col))
+        .unwrap_or_else(|| "not-found".to_string())
+}
+
+/// Collects the nodes that are attached to their parent under a given field name.
+///
+/// A reported node is not searched further, so a `field` child nested inside another
+/// reported node is absent from the result. `root` itself is never reported, because
+/// only children of visited nodes are inspected.
+///
+/// # Arguments
+///
+/// * `root` - The root node of the subtree to search.
+/// * `field` - The field name to look for.
+fn find_fields<'a>(root: &Node<'a>, field: &str) -> Vec<Node<'a>> {
+    let mut cursor = root.walk();
+    let mut found: Vec<Node<'a>> = Vec::new();
+    // Ids of the nodes already reported; they must not be traversed again.
+    let mut reported: HashSet<usize> = HashSet::new();
+    // Explicit stack standing in for recursion.
+    let mut pending: Vec<Node> = vec![*root];
+
+    while let Some(parent) = pending.pop() {
+        for hit in parent.children_by_field_name(field, &mut parent.walk()) {
+            reported.insert(hit.id());
+            found.push(hit);
+        }
+
+        // Reversed so that the first child ends up on top of the stack and is visited next.
+        let children: Vec<_> = parent.children(&mut cursor).collect();
+        pending.extend(children.into_iter().rev().filter(|c| !reported.contains(&c.id())));
+    }
+
+    found
+}
+
+/// Returns the first node attached to its parent under the field name `field`.
+///
+/// The subtree rooted at `root` is walked depth-first in document order; the child of the
+/// first visited node that has a `field` child is returned, or `None` if there is none.
+///
+/// # Arguments
+///
+/// * `root` - The root node of the subtree to search.
+/// * `field` - The field name to look for.
+fn find_first_field<'a>(root: &Node<'a>, field: &str) -> Option<Node<'a>> {
+    let mut cursor = root.walk();
+    // Explicit stack standing in for recursion.
+    let mut pending: Vec<Node> = vec![*root];
+
+    while let Some(parent) = pending.pop() {
+        if let Some(hit) = parent.child_by_field_name(field) {
+            return Some(hit);
+        }
+        // Reversed so that the first child ends up on top of the stack and is visited next.
+        let children: Vec<_> = parent.children(&mut cursor).collect();
+        pending.extend(children.into_iter().rev());
+    }
+
+    None
+}
+
+/// Collects the outermost nodes of the given kinds in a tree.
+///
+/// A matching node is recorded and not descended into, so matches nested inside another
+/// match are not reported. Nodes are returned in the order the traversal reaches them,
+/// which is not necessarily document order.
+fn find_kind<'a>(root: &Node<'a>, kinds: &HashSet<&str>) -> Vec<Node<'a>> {
+    let mut cursor = root.walk();
+    let mut found: Vec<Node<'a>> = Vec::new();
+    // Explicit stack standing in for recursion.
+    let mut pending: Vec<Node> = vec![*root];
+
+    while let Some(current) = pending.pop() {
+        if kinds.contains(current.kind()) {
+            found.push(current);
+            continue;
+        }
+        // Siblings are pushed first to last and therefore popped last to first.
+        pending.extend(current.children(&mut cursor));
+    }
+
+    found
+}
+
+/// Returns a copy of `source` without the bytes of the outermost nodes of `kinds`.
+/// Byte offsets are taken relative to the start byte of `root`.
+fn remove_kind_from_source(source: &[u8], root: &Node, kinds: &HashSet<&str>) -> Vec<u8> {
+    let offset = root.start_byte();
+    let mut doomed = find_kind(root, kinds);
+    // Cut from the end backwards so that the byte ranges still to be cut stay valid.
+    doomed.sort_by_key(|n| std::cmp::Reverse(n.start_byte()));
+    let mut stripped = source.to_vec();
+    for n in doomed {
+        stripped.drain(n.start_byte() - offset..n.end_byte() - offset);
+    }
+    stripped
+}
+
+//--------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use std::path::Path;
+
+    use polars::prelude::SortMultipleOptions;
+
+    use crate::utils::dataframes::*;
+    use crate::utils::fs::*;
+
+    use super::*;
+
+    const TEST_DATA: &str = "tests/data/phases/parse";
+
+    fn test_parse(input_file_path: &str, languages: Option<Vec<&str>>, should_pass: bool) {
+        let input_df = open_csv(&input_file_path, None, None);
+        assert!(input_df.is_ok());
+        let input_df = input_df.unwrap();
+        assert!(has_column(&input_df, "name"));
+        let input_df = input_df.column("name").unwrap().str().unwrap();
+
+        let output_file_path = format!("{}.functions.csv", input_file_path);
+        assert!(delete_file(&output_file_path, true).is_ok());
+
+        let logs_file_path = format!("{}.function_logs.csv", input_file_path);
+        assert!(delete_file(&logs_file_path, true).is_ok());
+
+        for path in input_df {
+            assert!(path.is_some());
+            assert!(delete_dir(&format!("{}.functions", path.unwrap()), true).is_ok());
+        }
+
+        if should_pass {
+            assert!(run(
+                input_file_path,
+                None,
+                None,
+                languages,
+                "ignore",
+                8,
+                0,
+                false,
+                &mut Logger::new()
+            )
+            .is_ok());
+
+            let logs_df = open_csv(&logs_file_path, None, None);
+            assert!(logs_df.is_ok());
+            let logs_df = logs_df.unwrap();
+            assert!(has_column(&logs_df, "name"));
+            let sorted_logs_df = logs_df
+                .sort(vec!["name"], SortMultipleOptions::new())
+                .unwrap();
+
+            let expected_logs_df = open_csv(
+                &format!("{}.function_logs.csv.expected", input_file_path),
+                None,
+                None,
+            );
+            assert!(expected_logs_df.is_ok());
+            let expected_logs_df = expected_logs_df.unwrap();
+            assert!(has_column(&expected_logs_df, "name"));
+            let sorted_expected_logs_df = expected_logs_df
+                .sort(vec!["name"], SortMultipleOptions::new())
+                .unwrap();
+            assert!(sorted_expected_logs_df.equals(&sorted_logs_df));
+
+            let output_df = open_csv(&output_file_path, None, None);
+            assert!(output_df.is_ok());
+            let output_df = output_df.unwrap();
+            assert!(has_column(&output_df, "path"));
+            let sorted_output_df = output_df
+                .sort(vec!["path"], SortMultipleOptions::new())
+                .unwrap();
+
+            let expected_df = open_csv(&format!("{}.expected", output_file_path), None, None);
+            assert!(expected_df.is_ok());
+            let expected_df = expected_df.unwrap();
+            assert!(has_column(&expected_df, "path"));
+            let sorted_expected_df = expected_df
+                .sort(vec!["path"], SortMultipleOptions::new())
+                .unwrap();
+
+            assert!(sorted_expected_df.equals(&sorted_output_df));
+
+            for path in sorted_output_df.column("path").unwrap().str().unwrap() {
+                assert!(path.is_some());
+                let path = Path::new(path.unwrap());
+                assert!(path.exists());
+                let expected_path_name = format!(
+                    "{}.expected/{}",
+                    path.parent().unwrap().to_str().unwrap(),
+                    path.file_name().unwrap().to_str().unwrap()
+                );
+                let expected_path = Path::new(&expected_path_name);
+                assert_eq!(
+                    std::fs::read_to_string(path).unwrap(),
+                    std::fs::read_to_string(expected_path).unwrap()
+                );
+            }
+        } else {
+            assert!(run(
+                input_file_path,
+                None,
+                None,
+                languages,
+                "ignore",
+                8,
+                0,
+                false,
+                &mut Logger::new()
+            )
+            .is_err());
+        }
+
+        assert!(delete_file(&output_file_path, true).is_ok());
+        assert!(delete_file(&logs_file_path, true).is_ok());
+
+        for path in input_df {
+            assert!(path.is_some());
+            assert!(delete_dir(&format!("{}.functions", path.unwrap()), true).is_ok());
+        }
+    }
+
+    #[test]
+    fn parse_fp() {
+        let input_file_path = format!("{}/to_parse.csv", TEST_DATA);
+
+        test_parse(&input_file_path, None, true);
+    }
+
+    #[test]
+    fn parse_go() {
+        let input_file_path = format!("{}/parse_go.csv", TEST_DATA);
+
+        test_parse(&input_file_path, None, true);
+    }
+
+    #[test]
+    fn invalid_file() {
+        let input_file_path = format!("{}/invalid.csv", TEST_DATA);
+
+        test_parse(&input_file_path, None, true);
+    }
+
+    #[test]
+    fn invalid_lang() {
+        let input_file_path = format!("{}/empty.csv", TEST_DATA);
+
+        test_parse(&input_file_path, Some(["rust"].to_vec()), false);
+    }
+
+    #[test]
+    fn empty() {
+        let input_file_path = format!("{}/empty.csv", TEST_DATA);
+
+        test_parse(&input_file_path, Some(["c"].to_vec()), true);
+    }
+}
diff --git a/src/phases/mod.rs b/src/phases/mod.rs
index 03ab30c..d70647e 100644
--- a/src/phases/mod.rs
+++ b/src/phases/mod.rs
@@ -24,3 +24,5 @@ pub mod languages;
 pub mod metadata;
 pub mod parse;
 pub mod pull_request;
+//pub mod tokenizer;
+pub mod alt_parse;

From bbfcb17a43018984acd66837af9edb4f75c40f9e Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Fri, 6 Mar 2026 16:22:42 +0100
Subject: [PATCH 02/14] implemented global count of tokens

---
 src/bin/main.rs         |  12 ++-
 src/phases/mod.rs       |   4 +-
 src/phases/tokenizer.rs | 209 ++++++++++++++++++++++++++++++++++++++++
 src/utils/bow.rs        |  28 ++++++
 4 files changed, 250 insertions(+), 3 deletions(-)
 create mode 100644 src/phases/tokenizer.rs

diff --git a/src/bin/main.rs b/src/bin/main.rs
index ea61c47..ba4b110 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -15,7 +15,7 @@
 use clap::{Arg, ArgAction, Command};
 use scyros::phases::{
     alt_parse, download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages,
-    filter_metadata, forks, ids, languages, metadata, parse, pull_request,
+    filter_metadata, forks, ids, languages, metadata, parse, pull_request, tokenizer,
 };
 use scyros::utils::error::*;
 use scyros::utils::logger::Logger;
@@ -37,6 +37,7 @@ fn cli() -> Command {
         .subcommand(parse::cli())
         .subcommand(extract_benchmarks::cli())
         .subcommand(alt_parse::cli())
+        .subcommand(tokenizer::cli())
         .arg(
             Arg::new("debug")
                 .long("debug")
@@ -231,6 +232,15 @@ fn main() {
                                     &mut logger,
                                 )
                             }
+                            else if subcommand == tokenizer::cli().get_name() {
+                                tokenizer::run(
+                                    cli_subargs.get_one::<String>("input").unwrap(),
+                                    //cli_subargs.get_one::<String>("output").map(|x| x.as_str()),
+                                    //cli_subargs.get_one::<String>("language").unwrap(),
+                                    cli_subargs.get_one::<String>("example_word").unwrap(),
+                                    &mut logger,
+                                )
+                            }
                             else {
                                 Error::new(&format!("The subcommand {} is not available. Run the program with the --help flag to see the list of subcommands", subcommand)).to_res()
                             }
diff --git a/src/phases/mod.rs b/src/phases/mod.rs
index d70647e..82cf74c 100644
--- a/src/phases/mod.rs
+++ b/src/phases/mod.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod alt_parse;
 pub mod download;
 pub mod duplicate_files;
 pub mod duplicate_ids;
@@ -24,5 +25,4 @@ pub mod languages;
 pub mod metadata;
 pub mod parse;
 pub mod pull_request;
-//pub mod tokenizer;
-pub mod alt_parse;
+pub mod tokenizer;
diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs
new file mode 100644
index 0000000..612ed5e
--- /dev/null
+++ b/src/phases/tokenizer.rs
@@ -0,0 +1,209 @@
+/* use std::arch::global_asm;
+use std::collections::HashMap;
+use std::f32::consts::E; */
+
+use crate::utils::bow::*;
+use crate::utils::error::*;
+use crate::utils::fs::*;
+use crate::utils::regex::*;
+use crate::utils::{/* csv::*,  */ logger::Logger};
+/* use clang::token;
+use polars::frame::row; */
+use clap::{Arg, /* ArgAction, */ Command};
+use polars::prelude::*;
+
+/* struct Token {
+    word: String,
+    local_count: usize,
+    global_count: usize,
+    global_position: usize,
+} */
+
+pub fn cli() -> Command {
+    Command::new("tokenizer")
+        .about("Tokenizes the functions in the input file and generates a global Bag of Words.")
+        .disable_version_flag(true)
+        .arg(
+            Arg::new("input")
+                .short('i')
+                .long("input")
+                .value_name("INPUT_PATH")
+                .help("Path to the input CSV file generated by the parser.")
+                .required(true),
+        )
+        .arg(
+            Arg::new("example_word")
+                .short('e')
+                .long("example-word")
+                .value_name("EXAMPLE_WORD")
+                .help("An example word to check the global Bag of Words for.")
+                .required(false)
+                .default_value("if"),
+        )
+}
+
+/// Builds a global Bag of Words from the functions listed in a parser-generated CSV file.
+///
+/// Rows are kept only if their `language` is `java` (hard-coded for now) and their `loc`
+/// is at least 5. The files referenced by the remaining rows are tokenized and merged
+/// into one Bag of Words, then the count and rank of `example_word` (lowercased) are
+/// logged. Errors from opening or filtering the CSV and from logging are returned.
+pub fn run(
+    input_path: &str, //path to csv provided by parser
+    example_word: &str,
+    logger: &mut Logger,
+) -> Result<(), Error> {
+    //No checks for language yet. Just uses java for now. Will add more languages later.
+    let language = "java";
+    let minimum_loc = 5; //temporary
+
+    let mut input_file = open_csv(
+        input_path,
+        Some(Schema::from_iter(vec![
+            Field::new("id".into(), DataType::UInt32),
+            Field::new("path".into(), DataType::String),
+            Field::new("name".into(), DataType::String),
+            Field::new("position".into(), DataType::String),
+            Field::new("loc".into(), DataType::UInt32),
+            Field::new("words".into(), DataType::UInt32),
+        ])),
+        Some(vec![
+            "id", "path", "name", "position", "language", "loc", "words",
+        ]),
+    )?;
+
+    let n_functions_before_language = input_file.height();
+    logger.log(&format!(
+        "  {} functions found in the input file, filtering by selected language",
+        n_functions_before_language
+    ))?;
+
+    input_file = map_err(
+        input_file
+            .lazy()
+            .filter(col("language").eq(lit(language)))
+            .collect(),
+        "Error filtering language",
+    )?;
+
+    let n_functions_after_language = input_file.height();
+
+    logger.log(&format!(
+        "  {} functions found after filtering ({:.2} %)",
+        n_functions_after_language,
+        if n_functions_before_language == 0 {
+            0.0
+        } else {
+            n_functions_after_language as f64 / n_functions_before_language as f64 * 100.0
+        }
+    ))?;
+
+    let n_functions_before_loc = input_file.height();
+
+    logger.log(&format!(
+        "{} functions found in the input file. Filtering those with less than {} lines of code.",
+        n_functions_before_loc, minimum_loc
+    ))?;
+
+    input_file = map_err(
+        input_file
+            .lazy()
+            .filter(col("loc").gt_eq(lit(minimum_loc)))
+            .collect(),
+        "Error filtering by lines of code",
+    )?;
+
+    let n_functions_after_loc = input_file.height();
+
+    logger.log(&format!(
+        "  {} functions found after filtering by lines of code ({:.2} %)",
+        n_functions_after_loc,
+        if n_functions_before_loc == 0 {
+            0.0
+        } else {
+            n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0
+        }
+    ))?;
+    let global_bow = global_counter(&input_file, logger)?;
+
+    let token_rankings = global_bow.token_rankings();
+
+    let example_word = example_word.to_ascii_lowercase();
+    let example_word_token = example_word.as_bytes();
+
+    logger.log("Tokenizer seems to have completed")?;
+    logger.log(&format!(
+        "  The token '{}' appears {} times and is ranked {} in the global Bag of Words.",
+        example_word,
+        token_rankings
+            .get(example_word_token)
+            .map(|(count, _)| *count)
+            .unwrap_or(0),
+        token_rankings
+            .get(example_word_token)
+            .map(|(_, rank)| *rank)
+            .unwrap_or(0)
+    ))?;
+
+    Ok(())
+}
+
+fn global_counter(input_file: &DataFrame, logger: &mut Logger) -> Result<Bow, Error> {
+    let word_matcher: Matcher = Matcher::words_matcher();
+    let mut global_bow: Bow = Bow::new();
+
+    for row in input_file
+        .column("path")
+        .and_then(|c| c.str())
+        .unwrap()
+        .into_iter()
+    {
+        match row {
+            Some(path) => {
+                //let function_code = std::fs::read_to_string(path)?;
+                match load_file(path, 1024 * 1024 * 1024) {
+                    Ok(Ok(function_code)) => {
+                        let local_bow =
+                            word_matcher.bag_of_words(&function_code.to_ascii_lowercase());
+                        global_bow.merge(local_bow);
+                    }
+                    Ok(Err(_e)) => {
+                        logger.log(&format!("  Warning: File to large at path {}", path))?;
+                    }
+                    Err(_e) => {
+                        logger.log(&format!("  Warning: Could not load file at path {}", path))?;
+                    }
+                }
+            }
+            None => {
+                let _ = logger.log("  Warning: Path not found");
+            }
+        }
+    }
+
+    Ok(global_bow)
+}
+
+/* fn tokenize_function(
+    function_code_path: &str,
+    separators: &Vec<&str>,
+    logger: &mut Logger
+) -> Result<(HashMap<String, usize>), Error> {
+    let function_string = std::fs::read_to_string(function_code_path)?;
+
+
+    let mut tokenized_string = function_string.clone();
+    for separator in separators {
+        tokenized_string = tokenized_string.replace(separator, " ");
+    }
+
+    let words_in_string: Vec<&str> = tokenized_string.split_whitespace().collect();
+
+    let mut counts: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+
+    for word in words_in_string {
+        *counts.entry(word.to_string()).or_insert(0) += 1;
+    }
+
+    Ok(counts)
+} */
diff --git a/src/utils/bow.rs b/src/utils/bow.rs
index 7676bad..4c2b347 100644
--- a/src/utils/bow.rs
+++ b/src/utils/bow.rs
@@ -81,6 +81,34 @@ impl Bow {
             .join("|")
             .into_bytes()
     }
+
+    /// Merges another Bag of Words into this one, summing the counts of shared tokens.
+    ///
+    /// # Arguments
+    ///
+    /// * `other` - The other Bag of Words to be merged into this one.
+    pub fn merge(&mut self, other: Bow) {
+        for (token, count) in other.map {
+            *self.map.entry(token).or_insert(0) += count;
+        }
+    }
+
+    /// Generates a ranking of tokens based on their frequency in the Bag of Words.
+    /// The ranking is a HashMap where the key is the token and the value is a tuple containing the frequency and the rank (1-based index).
+    /// Returns a HashMap where the key is the token and the value is a tuple containing the frequency and the rank.
+    pub fn token_rankings(&self) -> HashMap<Vec<u8>, (usize, usize)> {
+        let mut rankings: HashMap<Vec<u8>, (usize, usize)> = HashMap::new();
+        let mut count_vec: Vec<(&Vec<u8>, &usize)> = self.map.iter().collect();
+        //count_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count in descending order
+        count_vec.sort_by(|a, b| {
+            b.1.cmp(a.1) // primary: count descending
+                .then_with(|| a.0.cmp(b.0)) // secondary: token ascending
+        });
+        for (rank, (token, count)) in count_vec.into_iter().enumerate() {
+            rankings.insert(token.clone(), (*count, rank + 1));
+        }
+        rankings
+    }
 }
 
 #[cfg(test)]

From c7cad4a1ee42162f42a73d4a4fa27b5d377edb63 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Mon, 9 Mar 2026 14:34:36 +0100
Subject: [PATCH 03/14] updated code to work with refactoring of upstream
 repository. alt_parse removed for now

---
 .gitignore              |  1 +
 src/bin/main.rs         |  9 +++--
 src/phases/mod.rs       |  2 +-
 src/phases/tokenizer.rs | 75 +++++++++++++++++++----------------------
 4 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5e969b2..487d496 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ data/
 ghtokens.csv
 !tests/data
 outputs/
+src/phases/alt_parse.rs //Got broken by refactoring but is not in use currently, so ignoring for now
 
 *.zip
 // Negation pattern in .gitignore
diff --git a/src/bin/main.rs b/src/bin/main.rs
index 721b90c..cea9fb4 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -15,7 +15,7 @@
 use anyhow::{anyhow, Context, Result};
 use clap::{Arg, ArgAction, Command};
 use scyros::phases::{
-    alt_parse, download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages,
+    download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages,
     filter_metadata, forks, ids, languages, metadata, parse, pull_request, tokenizer,
 };
 use scyros::utils::logger::Logger;
@@ -37,7 +37,6 @@ fn cli() -> Command {
         .subcommand(duplicate_files::cli())
         .subcommand(parse::cli())
         .subcommand(extract_benchmarks::cli())
-        .subcommand(alt_parse::cli())
         .subcommand(tokenizer::cli())
         .arg(
             Arg::new("debug")
@@ -216,7 +215,7 @@ fn main() {
                                     &logger,
                                 )
                             }
-                            else if subcommand == alt_parse::cli().get_name() {
+                            /* else if subcommand == alt_parse::cli().get_name() {
                                 alt_parse::run(
                                     cli_subargs.get_one::<String>("input").unwrap(),
                                     cli_subargs.get_one::<String>("output").map(|x| x.as_str()),
@@ -232,14 +231,14 @@ fn main() {
                                     cli_subargs.get_flag("force"),
                                     &mut logger,
                                 )
-                            }
+                            } */
                             else if subcommand == tokenizer::cli().get_name() {
                                 tokenizer::run(
                                     cli_subargs.get_one::<String>("input").unwrap(),
                                     //cli_subargs.get_one::<String>("output").map(|x| x.as_str()),
                                     //cli_subargs.get_one::<String>("language").unwrap(),
                                     cli_subargs.get_one::<String>("example_word").unwrap(),
-                                    &mut logger,
+                                    &logger,
                                 )
                             }
                             else {
diff --git a/src/phases/mod.rs b/src/phases/mod.rs
index 82cf74c..58c2135 100644
--- a/src/phases/mod.rs
+++ b/src/phases/mod.rs
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-pub mod alt_parse;
+//pub mod alt_parse;
 pub mod download;
 pub mod duplicate_files;
 pub mod duplicate_ids;
diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs
index 612ed5e..61a16dc 100644
--- a/src/phases/tokenizer.rs
+++ b/src/phases/tokenizer.rs
@@ -3,17 +3,18 @@ use std::collections::HashMap;
 use std::f32::consts::E; */
 
 use crate::utils::bow::*;
-use crate::utils::error::*;
 use crate::utils::fs::*;
 use crate::utils::regex::*;
 use crate::utils::{/* csv::*,  */ logger::Logger};
+use anyhow::{/* anyhow, bail, Context,  */ Result};
+use tracing::info;
 /* use clang::token;
 use polars::frame::row; */
 use clap::{Arg, /* ArgAction, */ Command};
 use polars::prelude::*;
 
 /* struct Token {
-    word: String,
+    word: Vec<u8>,
     local_count: usize,
     global_count: usize,
     global_position: usize,
@@ -47,8 +48,8 @@ pub fn run(
     //output_path: &str,
     //language: &str,
     example_word: &str,
-    logger: &mut Logger,
-) -> Result<(), Error> {
+    _logger: &Logger, //not used currently but hopefully will later
+) -> Result<()> {
     //No checks for language yet. Just uses java for now. Will add more languages later.
     let language = "java";
     let minimum_loc = 5; //temporary
@@ -70,69 +71,61 @@ pub fn run(
     )?;
 
     let n_functions_before_language = input_file.height();
-    logger.log(&format!(
-        "  {} functions found in the input file, filtering by selected language",
+    info!(
+        "{} functions found in the input file, filtering by selected language",
         n_functions_before_language
-    ))?;
+    );
 
     //input_file = input_file.filter(&input_file.column("language")?.equal(language));
-    input_file = map_err(
-        input_file
-            .lazy()
-            .filter(col("language").eq(lit(language)))
-            .collect(),
-        "Error filtering language",
-    )?;
+    input_file = input_file
+        .lazy()
+        .filter(col("language").eq(lit(language)))
+        .collect()?;
 
     let n_functions_after_language = input_file.height();
-
-    logger.log(&format!(
-        "  {} functions found after filtering ({:.2} %)",
+    info!(
+        "  {} files found after filtering ({:.2} %)",
         n_functions_after_language,
         if n_functions_before_language == 0 {
             0
         } else {
             n_functions_after_language / n_functions_before_language * 100
         }
-    ))?;
-
+    );
     let n_functions_before_loc = input_file.height();
 
-    logger.log(&format!(
-        "{} functions found in the input file. Filtering those with less than {} lines of code.",
-        n_functions_before_loc, minimum_loc
-    ))?;
+    info!(" {} functions found after filtering by language, filtering functions with less that {} lines of code.", n_functions_before_loc, minimum_loc);
 
     //input_file = input_file.filter(&input_file.column("loc")?.greater_equal(minimum_loc))?;
 
-    input_file = map_err(
-        input_file
-            .lazy()
-            .filter(col("loc").gt_eq(lit(minimum_loc)))
-            .collect(),
-        "Error filtering by lines of code",
-    )?;
+    input_file = input_file
+        .lazy()
+        .filter(col("loc").gt_eq(lit(minimum_loc)))
+        .collect()?;
 
     let n_functions_after_loc = input_file.height();
 
-    logger.log(&format!(
-        "  {} functions found after filtering by lines of code ({:.2} %)", //something is weird with the percentage calculation here.
+    info!(
+        "{} functions found after filtering  ({:.2} %)", //something is weird with the percentage calculation here.
         n_functions_after_loc,
         if n_functions_before_loc == 0 {
             0
         } else {
             n_functions_after_loc / n_functions_before_loc * 100
         }
-    ))?;
-    let global_bow = global_counter(&input_file, logger)?;
+    );
+    let global_bow = global_counter(&input_file)?;
 
     let token_rankings = global_bow.token_rankings();
 
     let example_word = example_word.to_ascii_lowercase();
     let example_word_token = example_word.as_bytes();
 
-    logger.log("Tokenizer seems to have completed")?;
-    logger.log(&format!(
+    info!(
+        "Global Bag of Words generated. Checking for example word '{}'",
+        example_word
+    );
+    info!(
         "  The token '{}' appears {} times and is ranked {} in the global Bag of Words.",
         example_word,
         token_rankings
@@ -143,12 +136,12 @@ pub fn run(
             .get(example_word_token)
             .map(|(_, rank)| *rank)
             .unwrap_or(0)
-    ))?;
+    );
 
     Ok(())
 }
 
-fn global_counter(input_file: &DataFrame, logger: &mut Logger) -> Result<Bow, Error> {
+fn global_counter(input_file: &DataFrame) -> Result<Bow> {
     let word_matcher: Matcher = Matcher::words_matcher();
     let mut global_bow: Bow = Bow::new();
 
@@ -168,15 +161,15 @@ fn global_counter(input_file: &DataFrame, logger: &mut Logger) -> Result<Bow, Er
                         global_bow.merge(local_bow);
                     }
                     Ok(Err(_e)) => {
-                        logger.log(&format!("  Warning: File to large at path {}", path))?;
+                        info!("  Warning: File to large at path {}", path);
                     }
                     Err(_e) => {
-                        logger.log(&format!("  Warning: Could not load file at path {}", path))?;
+                        info!("  Warning: Could not load file at path {}", path);
                     }
                 }
             }
             None => {
-                let _ = logger.log("  Warning: Path not found");
+                info!("  Warning: Path not found");
             }
         }
     }

From 1d36c5aa445793688da8ba7af4d5a9d355f874d4 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Mon, 9 Mar 2026 15:38:00 +0100
Subject: [PATCH 04/14] separated main function from cli call

---
 src/phases/tokenizer.rs | 63 ++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 29 deletions(-)

diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs
index 61a16dc..3bed8df 100644
--- a/src/phases/tokenizer.rs
+++ b/src/phases/tokenizer.rs
@@ -6,13 +6,13 @@ use crate::utils::bow::*;
 use crate::utils::fs::*;
 use crate::utils::regex::*;
 use crate::utils::{/* csv::*,  */ logger::Logger};
-use anyhow::{/* anyhow, bail, Context,  */ Result};
+use anyhow::{anyhow, /*bail, Context,  */ Result};
 use tracing::info;
 /* use clang::token;
 use polars::frame::row; */
 use clap::{Arg, /* ArgAction, */ Command};
 use polars::prelude::*;
-
+use std::collections::HashMap;
 /* struct Token {
     word: Vec<u8>,
     local_count: usize,
@@ -38,18 +38,42 @@ pub fn cli() -> Command {
                 .long("example-word")
                 .value_name("EXAMPLE_WORD")
                 .help("An example word to check the global Bag of Words for.")
-                .required(false)
-                .default_value("if"),
+                .required(false),
         )
 }
 
-pub fn run(
+pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> {
+    let token_rankings = run_tokenizer(input_path, logger)?;
+
+    let example_word = example_word.to_ascii_lowercase();
+    let example_word_token = example_word.as_bytes();
+
+    info!(
+        "Global Bag of Words generated. Checking for example word '{}'",
+        example_word
+    );
+    info!(
+        "  The token '{}' appears {} times and is ranked {} in the global Bag of Words.",
+        example_word,
+        token_rankings
+            .get(example_word_token)
+            .map(|(count, _)| *count)
+            .unwrap_or(0),
+        token_rankings
+            .get(example_word_token)
+            .map(|(_, rank)| *rank)
+            .unwrap_or(0)
+    );
+
+    Ok(())
+}
+
+pub fn run_tokenizer(
     input_path: &str, //path to csv provided by parser
     //output_path: &str,
     //language: &str,
-    example_word: &str,
     _logger: &Logger, //not used currently but hopefully will later
-) -> Result<()> {
+) -> Result<HashMap<Vec<u8>, (usize, usize)>> {
     //No checks for language yet. Just uses java for now. Will add more languages later.
     let language = "java";
     let minimum_loc = 5; //temporary
@@ -116,29 +140,10 @@ pub fn run(
     );
     let global_bow = global_counter(&input_file)?;
 
-    let token_rankings = global_bow.token_rankings();
-
-    let example_word = example_word.to_ascii_lowercase();
-    let example_word_token = example_word.as_bytes();
+    let token_rankings: std::collections::HashMap<Vec<u8>, (usize, usize)> =
+        global_bow.token_rankings();
 
-    info!(
-        "Global Bag of Words generated. Checking for example word '{}'",
-        example_word
-    );
-    info!(
-        "  The token '{}' appears {} times and is ranked {} in the global Bag of Words.",
-        example_word,
-        token_rankings
-            .get(example_word_token)
-            .map(|(count, _)| *count)
-            .unwrap_or(0),
-        token_rankings
-            .get(example_word_token)
-            .map(|(_, rank)| *rank)
-            .unwrap_or(0)
-    );
-
-    Ok(())
+    Some(token_rankings).ok_or_else(|| anyhow!("No tokens found in the global Bag of Words."))
 }
 
 fn global_counter(input_file: &DataFrame) -> Result<Bow> {

From b830c46ee7fd38302c80fb8c34349f7df38525f4 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Mon, 9 Mar 2026 15:55:24 +0100
Subject: [PATCH 05/14] run_tokenizer now takes a DataFrame instead of an
 input_path

---
 src/phases/mod.rs       |  1 +
 src/phases/tokenizer.rs | 68 +++++++++++++++++++++--------------------
 2 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/src/phases/mod.rs b/src/phases/mod.rs
index 58c2135..ffc02b3 100644
--- a/src/phases/mod.rs
+++ b/src/phases/mod.rs
@@ -26,3 +26,4 @@ pub mod metadata;
 pub mod parse;
 pub mod pull_request;
 pub mod tokenizer;
+//pub mod type_3_duplicate_files;
diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs
index 3bed8df..21900c4 100644
--- a/src/phases/tokenizer.rs
+++ b/src/phases/tokenizer.rs
@@ -43,38 +43,6 @@ pub fn cli() -> Command {
 }
 
 pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> {
-    let token_rankings = run_tokenizer(input_path, logger)?;
-
-    let example_word = example_word.to_ascii_lowercase();
-    let example_word_token = example_word.as_bytes();
-
-    info!(
-        "Global Bag of Words generated. Checking for example word '{}'",
-        example_word
-    );
-    info!(
-        "  The token '{}' appears {} times and is ranked {} in the global Bag of Words.",
-        example_word,
-        token_rankings
-            .get(example_word_token)
-            .map(|(count, _)| *count)
-            .unwrap_or(0),
-        token_rankings
-            .get(example_word_token)
-            .map(|(_, rank)| *rank)
-            .unwrap_or(0)
-    );
-
-    Ok(())
-}
-
-pub fn run_tokenizer(
-    input_path: &str, //path to csv provided by parser
-    //output_path: &str,
-    //language: &str,
-    _logger: &Logger, //not used currently but hopefully will later
-) -> Result<HashMap<Vec<u8>, (usize, usize)>> {
-    //No checks for language yet. Just uses java for now. Will add more languages later.
     let language = "java";
     let minimum_loc = 5; //temporary
                          //let separators = vec!["(", ")", "[", "]", "{", "}", ";", ".", ",", ":", "=", "+", "-", "*", "/", "%", "<", ">", "&", "|", "!", "?", "~", "^", "#", "$", "@", "\"", "\\", "`", "'"]; //hardcoded separators for now. Will add more later and make it configurable.
@@ -138,7 +106,41 @@ pub fn run_tokenizer(
             n_functions_after_loc / n_functions_before_loc * 100
         }
     );
-    let global_bow = global_counter(&input_file)?;
+
+    let token_rankings = run_tokenizer(&input_file, logger)?;
+
+    let example_word = example_word.to_ascii_lowercase();
+    let example_word_token = example_word.as_bytes();
+
+    info!(
+        "Global Bag of Words generated. Checking for example word '{}'",
+        example_word
+    );
+    info!(
+        "  The token '{}' appears {} times and is ranked {} in the global Bag of Words.",
+        example_word,
+        token_rankings
+            .get(example_word_token)
+            .map(|(count, _)| *count)
+            .unwrap_or(0),
+        token_rankings
+            .get(example_word_token)
+            .map(|(_, rank)| *rank)
+            .unwrap_or(0)
+    );
+
+    Ok(())
+}
+
+pub fn run_tokenizer(
+    input_file: &DataFrame,
+    //output_path: &str,
+    //language: &str,
+    _logger: &Logger, //not used currently but hopefully will later
+) -> Result<HashMap<Vec<u8>, (usize, usize)>> {
+    //No checks for language yet. Just uses java for now. Will add more languages later.
+
+    let global_bow = global_counter(input_file)?;
 
     let token_rankings: std::collections::HashMap<Vec<u8>, (usize, usize)> =
         global_bow.token_rankings();

From 5f3fd89ea31f2df1e801b5e8e5ceea2ee43f4ef8 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Fri, 20 Mar 2026 16:18:17 +0100
Subject: [PATCH 06/14] inverted index and candidate map structures built

---
 src/bin/main.rs                      |  16 ++
 src/phases/mod.rs                    |   2 +-
 src/phases/tokenizer.rs              |  16 +-
 src/phases/type_3_duplicate_files.rs | 314 +++++++++++++++++++++++++++
 src/utils/bow.rs                     |   5 +
 src/utils/candidate_map.rs           |  77 +++++++
 src/utils/inverted_index.rs          |  54 +++++
 src/utils/mod.rs                     |   2 +
 8 files changed, 477 insertions(+), 9 deletions(-)
 create mode 100644 src/phases/type_3_duplicate_files.rs
 create mode 100644 src/utils/candidate_map.rs
 create mode 100644 src/utils/inverted_index.rs

diff --git a/src/bin/main.rs b/src/bin/main.rs
index cea9fb4..883911a 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -17,6 +17,7 @@ use clap::{Arg, ArgAction, Command};
 use scyros::phases::{
     download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages,
     filter_metadata, forks, ids, languages, metadata, parse, pull_request, tokenizer,
+    type_3_duplicate_files,
 };
 use scyros::utils::logger::Logger;
 use tracing::{error, info};
@@ -38,6 +39,7 @@ fn cli() -> Command {
         .subcommand(parse::cli())
         .subcommand(extract_benchmarks::cli())
         .subcommand(tokenizer::cli())
+        .subcommand(type_3_duplicate_files::cli())
         .arg(
             Arg::new("debug")
                 .long("debug")
@@ -241,6 +243,20 @@ fn main() {
                                     &logger,
                                 )
                             }
+                            else if subcommand == type_3_duplicate_files::cli().get_name() {
+                                type_3_duplicate_files::run(
+                                    cli_subargs.get_one::<String>("input").unwrap(),
+                                    cli_subargs.get_one::<String>("output").map(|x| x.as_str()),
+                                    cli_subargs.get_one::<String>("map").map(|x| x.as_str()),
+                                    cli_subargs.get_one::<String>("logs").map(|x| x.as_str()),
+                                    /* languages */
+                                    *cli_subargs.get_one::<usize>("threads").unwrap(),
+                                    *cli_subargs.get_one::<usize>("p_prefix").unwrap(),
+                                    *cli_subargs.get_one::<f64>("threshold").unwrap(),
+                                    cli_subargs.get_one::<String>("example_word"),
+                                    &logger,
+                                )
+                            }
                             else {
                                 Err(anyhow!("The subcommand {} is not available. Run the program with the --help flag to see the list of subcommands", subcommand))
                             }
diff --git a/src/phases/mod.rs b/src/phases/mod.rs
index ffc02b3..14c1323 100644
--- a/src/phases/mod.rs
+++ b/src/phases/mod.rs
@@ -26,4 +26,4 @@ pub mod metadata;
 pub mod parse;
 pub mod pull_request;
 pub mod tokenizer;
-//pub mod type_3_duplicate_files;
+pub mod type_3_duplicate_files;
diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs
index 21900c4..89f41ee 100644
--- a/src/phases/tokenizer.rs
+++ b/src/phases/tokenizer.rs
@@ -6,13 +6,13 @@ use crate::utils::bow::*;
 use crate::utils::fs::*;
 use crate::utils::regex::*;
 use crate::utils::{/* csv::*,  */ logger::Logger};
-use anyhow::{anyhow, /*bail, Context,  */ Result};
+use anyhow::{/*anyhow, bail, Context,  */ Result};
 use tracing::info;
 /* use clang::token;
 use polars::frame::row; */
 use clap::{Arg, /* ArgAction, */ Command};
 use polars::prelude::*;
-use std::collections::HashMap;
+//use std::collections::HashMap;
 /* struct Token {
     word: Vec<u8>,
     local_count: usize,
@@ -42,7 +42,7 @@ pub fn cli() -> Command {
         )
 }
 
-pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> {
+pub fn run(input_path: &str, example_word: &str, _logger: &Logger) -> Result<()> {
     let language = "java";
     let minimum_loc = 5; //temporary
                          //let separators = vec!["(", ")", "[", "]", "{", "}", ";", ".", ",", ":", "=", "+", "-", "*", "/", "%", "<", ">", "&", "|", "!", "?", "~", "^", "#", "$", "@", "\"", "\\", "`", "'"]; //hardcoded separators for now. Will add more later and make it configurable.
@@ -107,8 +107,8 @@ pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()>
         }
     );
 
-    let token_rankings = run_tokenizer(&input_file, logger)?;
-
+    //let token_rankings = run_tokenizer(&input_file, logger)?;
+    let token_rankings = global_counter(&input_file)?.token_rankings();
     let example_word = example_word.to_ascii_lowercase();
     let example_word_token = example_word.as_bytes();
 
@@ -132,7 +132,7 @@ pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()>
     Ok(())
 }
 
-pub fn run_tokenizer(
+/* pub fn run_tokenizer(
     input_file: &DataFrame,
     //output_path: &str,
     //language: &str,
@@ -146,9 +146,9 @@ pub fn run_tokenizer(
         global_bow.token_rankings();
 
     Some(token_rankings).ok_or_else(|| anyhow!("No tokens found in the global Bag of Words."))
-}
+} */
 
-fn global_counter(input_file: &DataFrame) -> Result<Bow> {
+pub fn global_counter(input_file: &DataFrame) -> Result<Bow> {
     let word_matcher: Matcher = Matcher::words_matcher();
     let mut global_bow: Bow = Bow::new();
 
diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs
new file mode 100644
index 0000000..0618d67
--- /dev/null
+++ b/src/phases/type_3_duplicate_files.rs
@@ -0,0 +1,314 @@
+use crate::phases::tokenizer::global_counter;
+use crate::utils::fs::*;
+use crate::utils::inverted_index::*;
+use crate::utils::logger::Logger;
+use crate::utils::regex::*;
+use anyhow::{/* Error,  */ Result};
+use blake3;
+use clap::{Arg, Command};
+use core::f64;
+use polars::prelude::*;
+use std::cmp::Reverse;
+use std::collections::HashMap;
+use std::vec;
+use tracing::info;
+
+pub fn cli() -> Command {
+    Command::new("type_3_duplicate_files")
+        .about("Detects type 3 clones by building an index based on the most common tokens in the functions and their frequencies.")
+        .disable_version_flag(true)
+        .arg(
+            Arg::new("input")
+                .short('i')
+                .long("input")
+                .help("Path to the input CSV file generated by the parser.")
+                .required(true),
+        )
+        .arg(
+            Arg::new("output")
+                .short('o')
+                .long("output")
+                .help("Path to the output CSV file to store unique files metadata.")
+                .required(false),
+        )
+        .arg(
+            Arg::new("map")
+                .short('m')
+                .long("map")
+                .help("Path to the map CSV file to store the mapping of clones to their originals.")
+                .required(false),
+        )
+        .arg(
+            Arg::new("logs")
+                .short('l')
+                .long("logs")
+                .help("Path to the logs file to store error logs.")
+                .required(false),
+        )
+        .arg(
+            Arg::new("languages")
+                .short('g')
+                .long("languages")
+                .help("Comma-separated list of languages to consider. If not provided, all languages will be considered.")
+                .required(false),
+        )
+        .arg(
+            Arg::new("threads")
+                .short('n')
+                .help("Number of threads to use, default is 1.")
+                .default_value("1")
+                .value_parser(clap::value_parser!(usize))
+        )
+        .arg(
+            Arg::new("p_prefix")
+                .short('p')
+                .long("p_prefix")
+                .default_value("1")
+                .help("Number of tokens to consider for the prefix. Default is 1.")
+                .value_parser(clap::value_parser!(usize))
+        )
+        .arg(
+            Arg::new("threshold")
+                .short('t')
+                .long("threshold")
+                .help("Similarity threshold. Default is 0.8.")
+                .default_value("0.8")
+                .value_parser(clap::value_parser!(f64))
+        )
+        .arg(
+            Arg::new("example_word")
+                .short('e')
+                .long("example-word")
+                .help("An example word to check the global Bag of Words for.")
+                .required(false),
+        )
+}
+
+pub fn run(
+    input_path: &str, //The path to the input CSV file storing the file paths, output of the parser phase
+    _output_path: Option<&str>, //optional path to the output CSV file to store unique files metadata.
+    _map_path: Option<&str>, //optional path to the map CSV file to store the mapping of clones to their originals.
+    _logs_path: Option<&str>, //for error logs, not implemented yet
+    /* _opt_languages: Option<Vec<&str>>, //optional list of languages. Currently java is hardcoded */
+    _threads: usize,               //current implementation is single-threaded
+    p_prefix: usize,               //number of tokens to consider for the prefix, default is 1
+    threshold: f64,                //threshold for the prefix length, default is 0.8
+    example_word: Option<&String>, //an example word to check the global Bag of Words for, optional
+    _logger: &Logger,
+) -> Result<()> {
+    let language = "java";
+    let minimum_loc = 5; //temporary
+    let mut input_file = open_csv(
+        input_path,
+        Some(Schema::from_iter(vec![
+            Field::new("id".into(), DataType::UInt32),
+            Field::new("path".into(), DataType::String),
+            Field::new("name".into(), DataType::String),
+            Field::new("position".into(), DataType::String),
+            Field::new("loc".into(), DataType::UInt32),
+            Field::new("words".into(), DataType::UInt32),
+        ])),
+        Some(vec![
+            "id", "path", "name", "position", "language", "loc", "words",
+        ]),
+    )?;
+
+    let n_functions_before_language = input_file.height();
+    info!(
+        "{} functions found in the input file, filtering by selected language",
+        n_functions_before_language
+    );
+
+    //input_file = input_file.filter(&input_file.column("language")?.equal(language));
+    input_file = input_file
+        .lazy()
+        .filter(col("language").eq(lit(language)))
+        .collect()?;
+
+    let n_functions_after_language = input_file.height();
+    info!(
+        "  {} files found after filtering ({:.2} %)",
+        n_functions_after_language,
+        if n_functions_before_language == 0 {
+            0
+        } else {
+            (n_functions_after_language as f64 / n_functions_before_language as f64 * 100.0)
+                as usize
+        }
+    );
+    let n_functions_before_loc = input_file.height();
+
+    info!(" {} functions found after filtering by language, filtering functions with less that {} lines of code.", n_functions_before_loc, minimum_loc);
+
+    //input_file = input_file.filter(&input_file.column("loc")?.greater_equal(minimum_loc))?;
+
+    input_file = input_file
+        .lazy()
+        .filter(col("loc").gt_eq(lit(minimum_loc)))
+        .collect()?;
+
+    let n_functions_after_loc = input_file.height();
+
+    info!(
+        "{} functions found after filtering  ({:.2} %)", //something is weird with the percentage calculation here.
+        n_functions_after_loc,
+        if n_functions_before_loc == 0 {
+            0
+        } else {
+            (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize
+        }
+    );
+    let global_bow = global_counter(&input_file)?;
+    let token_rankings = global_bow.token_rankings();
+    let vector_of_indices_plus_min_max =
+        index_builder(&input_file, token_rankings, p_prefix, threshold)?;
+    // Maximum and minimum 'words' in input file
+    let vector_of_indices = vector_of_indices_plus_min_max.0;
+    let min_words = vector_of_indices_plus_min_max.1 .0;
+    let max_words = vector_of_indices_plus_min_max.1 .1;
+    info!(
+        "Built {} indices with prefix scheme from 1 to {}, minimum words in a function: {}, maximum words in a function: {}.",
+        vector_of_indices.len(),
+        p_prefix,
+        min_words,
+        max_words
+    );
+
+    if let Some(word) = example_word {
+        let word = word.to_owned().as_bytes().to_ascii_lowercase();
+        let mut index_number = 1;
+        for index in vector_of_indices.iter() {
+            info!(
+                "Index {} has {} entries, total length of vectors in entries: {}",
+                index_number,
+                index.len(),
+                index.len_tokens()
+            );
+            if let Some(entries) = index.get(&word) {
+                info!(
+                    "Entries for the example word '{}' in index {}§:",
+                    String::from_utf8_lossy(&word),
+                    index_number
+                );
+                for (function_id, count) in entries {
+                    info!("Function ID: {}, Count: {}", function_id, count);
+                }
+            } else {
+                info!(
+                    "The example word '{}' was not found in index {}.",
+                    String::from_utf8_lossy(&word),
+                    index_number
+                );
+            }
+            index_number += 1;
+        }
+    }
+
+    //go through input file again? Means i can grab 'words' from the file. Could do something like just checking candidates
+
+    Ok(())
+}
+
+fn index_builder(
+    input_file: &DataFrame,
+    token_rankings: HashMap<Vec<u8>, (usize, usize)>,
+    p_prefix: usize,
+    threshold: f64,
+) -> Result<(Vec<InvertedIndex>, (usize, usize))> {
+    let word_matcher: Matcher = Matcher::words_matcher();
+
+    let mut vector_of_indices: Vec<InvertedIndex> = Vec::new();
+    for _i in 1..=p_prefix {
+        vector_of_indices.push(InvertedIndex::new());
+        //info!("Initialized index {}.", _i);
+    }
+    let mut min_words = usize::MAX;
+    let mut max_words = 0;
+    for path in input_file
+        .column("path")
+        .and_then(|c| c.str())
+        .unwrap()
+        .into_iter()
+        .flatten()
+    {
+        match load_file(path, 1024 * 1024 * 1024) {
+            Ok(Ok(function_code)) => {
+                let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase());
+                let mut vectored_bow = local_bow.vectorize();
+                vectored_bow.sort_by_key(|(token, _)| {
+                    Reverse(
+                        token_rankings
+                            .get(token)
+                            .map(|(_, rank)| *rank)
+                            .unwrap_or(usize::MAX),
+                    )
+                });
+                let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::<usize>();
+
+                // Min and Max codeblock are in number of words, not tokens but seeing as they're only used for estimating verification cost they don't need to be precise
+                if codeblock_length < min_words {
+                    min_words = codeblock_length;
+                }
+                if codeblock_length > max_words {
+                    max_words = codeblock_length;
+                }
+                let _verification_cost_per_candidate_estimate =
+                    (min_words as f64 + max_words as f64) / 2.0;
+                //Temporarily shut off so the compiler doesn't complain about unused variables, will be used later
+                let prefix_length =
+                    codeblock_length - ((codeblock_length as f64) * threshold).round() as usize + 1;
+
+                let mut cumulative_count = 0;
+                let mut p = 1;
+                let function_id = blake3::hash(path.as_bytes());
+                //info!("Prefix length: {}, total tokens: {}, codeblock length: {}", prefix_length, vectored_bow.len(), codeblock_length);
+
+                for (token, count) in vectored_bow {
+                    cumulative_count += count;
+                    vector_of_indices[p - 1].add(&token, count, function_id);
+                    if cumulative_count >= prefix_length {
+                        if p == p_prefix {
+                            //info!("Prefix scheme {} added token {} with count {}", p, String::from_utf8_lossy(&token), count);
+                            break;
+                        } else {
+                            p += 1;
+                        }
+                    }
+                }
+            }
+            Ok(Err(_e)) => {
+                info!("Warning: File too large at path '{}', skipping.", path);
+            }
+            Err(_e) => {
+                info!("Failed to read file at path '{}', skipping.", path);
+            }
+        }
+    }
+    info!("Finished building indices.");
+    Ok((vector_of_indices, (min_words, max_words)))
+}
+
+/* fn delta_filter_cost(
+    prefix_vector: &Vec<(Vec<u8>, usize)>,
+    vector_of_indices: &Vec<InvertedIndex>,
+    p_prefix: usize,
+    &previous_cost: &usize,
+) -> usize {
+    let mut cost = 0;
+    if p_prefix == 1 {
+        for (token, _) in prefix_vector {
+            cost += vector_of_indices[0].token_frequency(token, false);
+        }
+    } else {
+        let last_token = prefix_vector.last().unwrap().0.clone();
+        for (token, _) in prefix_vector {
+            cost += vector_of_indices[p_prefix - 1].token_frequency(token, false);
+        }
+        for p in 1..(p_prefix - 1) {
+            // the previous for-loop already counted the last token for the current inverted_index
+            cost += vector_of_indices[p - 1].token_frequency(&last_token, false);
+        }
+    }
+    let total_cost = previous_cost + cost;
+    total_cost
+} */
diff --git a/src/utils/bow.rs b/src/utils/bow.rs
index 4c2b347..897b0a8 100644
--- a/src/utils/bow.rs
+++ b/src/utils/bow.rs
@@ -109,6 +109,11 @@ impl Bow {
         }
         rankings
     }
+
+    /// Consumes the bag and returns its `(token, count)` entries in the map's iteration order.
+    pub fn vectorize(self) -> Vec<(Vec<u8>, usize)> {
+        self.map.into_iter().collect()
+    }
 }
 
 #[cfg(test)]
diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs
new file mode 100644
index 0000000..47f0a77
--- /dev/null
+++ b/src/utils/candidate_map.rs
@@ -0,0 +1,77 @@
+use std::collections::{HashMap, HashSet};
+
+pub struct CandidateEntry {
+    pub matches: usize,
+    // `matches` mirrors the key of the `match_histogram` bucket this candidate sits in.
+    // Benefit: the candidate's current bucket can be found directly from its entry.
+    // Cost: the value has to be kept in sync here whenever the histogram is updated.
+    pub length: usize,
+    pub last_token_seen_pos: usize,
+}
+
+pub struct CandidateMap {
+    entries: HashMap<blake3::Hash, CandidateEntry>, // per-candidate state, keyed by function id
+    match_histogram: HashMap<usize, HashSet<blake3::Hash>>, // match count -> ids filed under that count
+}
+
+impl Default for CandidateMap {
+    fn default() -> Self {
+        Self::new() // same empty map as the explicit constructor
+    }
+}
+
+impl CandidateMap {
+    /// Creates a map with no candidates and an empty histogram.
+    pub fn new() -> Self {
+        Self {
+            entries: HashMap::new(),
+            match_histogram: HashMap::new(),
+        }
+    }
+
+    /// Adds `new_matches` to the candidate `function_id` (created on first sight) and moves it
+    /// to the histogram bucket for its new total; `length` and `last_token_seen_pos` are
+    /// overwritten with the supplied values.
+    pub fn add_candidate(
+        &mut self,
+        function_id: blake3::Hash,
+        length: usize,
+        new_matches: usize,
+        last_token_seen_pos: usize,
+    ) {
+        let entry = self.entries.entry(function_id).or_insert(CandidateEntry {
+            matches: 0,
+            length,
+            last_token_seen_pos,
+        });
+
+        // Always leave the old bucket, including bucket 0: a candidate first added with
+        // `new_matches == 0` would otherwise stay counted there after its total grows.
+        if let Some(bucket) = self.match_histogram.get_mut(&entry.matches) {
+            bucket.remove(&function_id);
+        }
+
+        entry.matches += new_matches;
+        entry.length = length;
+        entry.last_token_seen_pos = last_token_seen_pos;
+        self.match_histogram
+            .entry(entry.matches)
+            .or_default()
+            .insert(function_id);
+    }
+
+    /// Counts candidates with exactly `n` matches (`mode == "exact"`) or with `n` or more
+    /// (`mode == "at_least"`); panics on any other `mode`.
+    pub fn count_candidates_with_n_matches(&self, n: usize, mode: &str) -> usize {
+        match mode {
+            "exact" => self.match_histogram.get(&n).map_or(0, HashSet::len),
+            "at_least" => self
+                .match_histogram
+                .iter()
+                .filter(|(&matches, _)| matches >= n)
+                .map(|(_, bucket)| bucket.len())
+                .sum(),
+            _ => panic!("Invalid mode: {}", mode),
+        }
+    }
+}
diff --git a/src/utils/inverted_index.rs b/src/utils/inverted_index.rs
new file mode 100644
index 0000000..a976fd3
--- /dev/null
+++ b/src/utils/inverted_index.rs
@@ -0,0 +1,54 @@
+use blake3::Hash;
+use std::collections::HashMap;
+pub struct InvertedIndex {
+    map: HashMap<Vec<u8>, Vec<(Hash, usize)>>, // token -> postings; each posting is the (function id, count of the token in that function) pair passed to `add`
+}
+
+impl Default for InvertedIndex {
+    fn default() -> Self {
+        Self::new() // same empty index as the explicit constructor
+    }
+}
+
+impl InvertedIndex {
+    /// Creates an empty index.
+    pub fn new() -> Self {
+        InvertedIndex {
+            map: HashMap::default(),
+        }
+    }
+    /// Appends `(function_id, count)` to the posting list of `token`, creating the list if needed.
+    pub fn add(&mut self, token: &[u8], count: usize, function_id: Hash) {
+        self.map
+            .entry(token.to_owned())
+            .or_default()
+            .push((function_id, count));
+    }
+    /// Returns the posting list recorded for `token`, if any.
+    pub fn get(&self, token: &[u8]) -> Option<&Vec<(Hash, usize)>> {
+        self.map.get(token)
+    }
+    /// Number of distinct tokens in the index.
+    pub fn len(&self) -> usize {
+        self.map.len()
+    }
+    /// Total number of postings, summed over all tokens.
+    pub fn len_tokens(&self) -> usize {
+        self.map.values().map(|v| v.len()).sum()
+    }
+    /// Returns `true` when no token has been indexed.
+    pub fn is_empty(&self) -> bool {
+        self.map.is_empty()
+    }
+    /// Frequency of `token`: the summed per-function counts when `count_duplicates` is set,
+    /// otherwise the number of postings; 0 for a token that was never indexed.
+    pub fn token_frequency(&self, token: &[u8], count_duplicates: bool) -> usize {
+        match self.get(token) {
+            Some(postings) if count_duplicates => {
+                postings.iter().map(|(_, count)| *count).sum()
+            }
+            Some(postings) => postings.len(),
+            None => 0,
+        }
+    }
+}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index 3d01ca7..84192f7 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -13,11 +13,13 @@
 // limitations under the License.
 
 pub mod bow;
+pub mod candidate_map;
 pub mod csv;
 pub mod dataframes;
 pub mod fs;
 pub mod github;
 pub mod github_api;
+pub mod inverted_index;
 pub mod json;
 pub mod logger;
 pub mod regex;

From ba0fc06971f457292bbec63efae2831622448aa7 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Mon, 23 Mar 2026 14:25:30 +0100
Subject: [PATCH 07/14] renamed rust-toolchain back

---
 dummy_rust-toolchain.toml => rust-toolchain.toml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename dummy_rust-toolchain.toml => rust-toolchain.toml (100%)

diff --git a/dummy_rust-toolchain.toml b/rust-toolchain.toml
similarity index 100%
rename from dummy_rust-toolchain.toml
rename to rust-toolchain.toml

From 941aabaada59a61fa075d7309a7c9743a973bff9 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Tue, 7 Apr 2026 15:56:20 +0200
Subject: [PATCH 08/14] added detect_clones and verify_clones with accompanying
 data structures

---
 .gitignore                           |   3 +-
 src/phases/type_3_duplicate_files.rs | 378 ++++++++++++++++++++++++---
 src/utils/candidate_map.rs           | 133 +++++++++-
 src/utils/inverted_index.rs          |  26 +-
 4 files changed, 483 insertions(+), 57 deletions(-)

diff --git a/.gitignore b/.gitignore
index 488e813..d4e6d87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,4 +16,5 @@ outputs/
 src/phases/alt_parse.rs //Got broken by refactoring but is not in use currently, so ignoring for now
 *.zip
 *.tar.gz
-result/
\ No newline at end of file
+result/
+examples/
\ No newline at end of file
diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs
index 78ccd3d..4ea31b9 100644
--- a/src/phases/type_3_duplicate_files.rs
+++ b/src/phases/type_3_duplicate_files.rs
@@ -1,4 +1,5 @@
 use crate::phases::tokenizer::global_counter;
+use crate::utils::candidate_map::*;
 use crate::utils::fs::*;
 use crate::utils::inverted_index::*;
 use crate::utils::logger::Logger;
@@ -8,8 +9,8 @@ use blake3;
 use clap::{Arg, Command};
 use core::f64;
 use polars::prelude::*;
-use std::cmp::Reverse;
-use std::collections::HashMap;
+use std::cmp::{max, min, Reverse};
+use std::collections::{HashMap, HashSet};
 use std::vec;
 use tracing::info;
 
@@ -158,10 +159,33 @@ pub fn run(
             (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize
         }
     );
+
+    //moved here from detect_clones
+    let paths_column = input_file.column("path")?.str()?;
+    let words_column = input_file.column("words")?.u32()?;
+    let rows: Vec<(&str, usize)> = paths_column
+        .into_iter()
+        .zip(words_column)
+        .filter_map(|(path_opt, words_opt)| match (path_opt, words_opt) {
+            (Some(path), Some(words)) => Some((path, words as usize)),
+            _ => None,
+        })
+        .collect();
+
+    let function_paths_and_lengths: HashMap<blake3::Hash, (&str, usize)> = rows
+        .iter()
+        .map(|(path, words)| (blake3::hash(path.as_bytes()), (*path, *words)))
+        .collect();
+
     let global_bow = global_counter(&input_file)?;
     let token_rankings = global_bow.token_rankings();
-    let vector_of_indices_plus_min_max =
-        index_builder(&input_file, token_rankings, p_prefix, threshold)?;
+    let vector_of_indices_plus_min_max = index_builder(
+        &input_file,
+        &token_rankings,
+        p_prefix,
+        threshold,
+        &function_paths_and_lengths,
+    )?;
     // Maximum and minimum 'words' in input file
     let vector_of_indices = vector_of_indices_plus_min_max.0;
     let min_words = vector_of_indices_plus_min_max.1 .0;
@@ -190,8 +214,11 @@ pub fn run(
                     String::from_utf8_lossy(&word),
                     index_number
                 );
-                for (function_id, count) in entries {
-                    info!("Function ID: {}, Count: {}", function_id, count);
+                for (function_id, count, (token_position, cumulative_count)) in entries {
+                    info!(
+                        "Function ID: {}, Count: {}, Token Position: {}, Cumulative Count: {}",
+                        function_id, count, token_position, cumulative_count
+                    );
                 }
             } else {
                 info!(
@@ -205,15 +232,25 @@ pub fn run(
     }
 
     //go through input file again? Means i can grab 'words' from the file. Could do something like just checking candidates
-
+    let clone_map = detect_clones(
+        &token_rankings,
+        &vector_of_indices,
+        threshold,
+        &function_paths_and_lengths,
+    )?;
+    info!(
+        "Finished detecting clones. {} unique files found.",
+        clone_map.len()
+    );
     Ok(())
 }
 
 fn index_builder(
     input_file: &DataFrame,
-    token_rankings: HashMap<Vec<u8>, (usize, usize)>,
+    token_rankings: &HashMap<Vec<u8>, (usize, usize)>,
     p_prefix: usize,
     threshold: f64,
+    function_paths_and_lengths: &HashMap<blake3::Hash, (&str, usize)>,
 ) -> Result<(Vec<InvertedIndex>, (usize, usize))> {
     let word_matcher: Matcher = Matcher::words_matcher();
 
@@ -243,8 +280,12 @@ fn index_builder(
                             .unwrap_or(usize::MAX),
                     )
                 });
-                let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::<usize>();
-
+                //let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::<usize>();
+                let codeblock_length = function_paths_and_lengths
+                    .get(&blake3::hash(path.as_bytes()))
+                    .map(|(_, count)| *count)
+                    .unwrap_or(0);
+                // Could probably rewrite this to get number of words from the input file instead of calculating it here
                 // Min and Max codeblock are in number of words, not tokens but seeing as they're only used for estimating verification cost they don't need to be precise
                 if codeblock_length < min_words {
                     min_words = codeblock_length;
@@ -252,9 +293,6 @@ fn index_builder(
                 if codeblock_length > max_words {
                     max_words = codeblock_length;
                 }
-                let _verification_cost_per_candidate_estimate =
-                    (min_words as f64 + max_words as f64) / 2.0;
-                //Temporarily shut off so the compiler doesn't complain about unused variables, will be used later
                 let prefix_length =
                     codeblock_length - ((codeblock_length as f64) * threshold).round() as usize + 1;
 
@@ -263,9 +301,9 @@ fn index_builder(
                 let function_id = blake3::hash(path.as_bytes());
                 //info!("Prefix length: {}, total tokens: {}, codeblock length: {}", prefix_length, vectored_bow.len(), codeblock_length);
 
-                for (token, count) in vectored_bow {
+                for (idx, (token, count)) in vectored_bow.iter().enumerate() {
                     cumulative_count += count;
-                    vector_of_indices[p - 1].add(&token, count, function_id);
+                    vector_of_indices[p - 1].add(token, function_id, *count, idx, cumulative_count);
                     if cumulative_count >= prefix_length {
                         if p == p_prefix {
                             //info!("Prefix scheme {} added token {} with count {}", p, String::from_utf8_lossy(&token), count);
@@ -288,37 +326,301 @@ fn index_builder(
     Ok((vector_of_indices, (min_words, max_words)))
 }
 
-/* fn delta_filter_cost(
-    prefix_vector: &Vec<(Vec<u8>, usize)>,
-    vector_of_indices: &Vec<InvertedIndex>,
+fn delta_filter_cost(
+    token_tuple: &(Vec<u8>, usize), // (token, count); only the token is read here
+    vector_of_indices: &[InvertedIndex], // one index per prefix scheme; scheme p is stored at position p - 1
     p_prefix: usize,
-    &previous_cost: &usize,
+    new: bool, // the caller passes true for the last token of the current prefix
 ) -> usize {
+    let token = &token_tuple.0;
     let mut cost = 0;
-    if p_prefix == 1 {
-        for (token, _) in prefix_vector {
-            cost += vector_of_indices[0].token_frequency(token, false);
+    if new {
+        // A token that just entered the prefix is charged its frequency in every index 1..=p_prefix.
+        for p in 1..=p_prefix {
+            cost += vector_of_indices[p - 1].token_frequency(token, false);
         }
     } else {
-        let last_token = prefix_vector.last().unwrap().0.clone();
-        for (token, _) in prefix_vector {
-            cost += vector_of_indices[p_prefix - 1].token_frequency(token, false);
+        // A token already in the prefix is only charged its frequency in the newest index (p_prefix).
+        cost += vector_of_indices[p_prefix - 1].token_frequency(token, false);
+    }
+    cost
+}
+
+/// Number of leading `vectored_bow` entries whose counts sum to at least `prefix_length`.
+fn weighted_prefix_end(vectored_bow: &[(Vec<u8>, usize)], prefix_length: usize) -> usize {
+    if prefix_length == 0 {
+        info!("Prefix length is 0, returning 0 for weighted prefix end.");
+        return 0; // not expected to happen in practice
+    }
+    let mut cumulative_count = 0usize;
+    for (idx, (_, count)) in vectored_bow.iter().enumerate() {
+        cumulative_count += *count;
+        if cumulative_count >= prefix_length {
+            return idx + 1; // idx is 0-based, so the prefix holds idx + 1 entries
         }
-        for p in 1..(p_prefix - 1) {
-            // the previous for-loop already counted the last token for the current inverted_index
-            cost += vector_of_indices[p - 1].token_frequency(&last_token, false);
     }
+    info!("Warning: prefix_length {} is greater than total token count {}, returning full length of vectored_bow.", prefix_length, cumulative_count);
+    vectored_bow.len()
+}
+
+fn detect_clones(
+    token_rankings: &HashMap<Vec<u8>, (usize, usize)>,
+    vector_of_indices: &[InvertedIndex], // one inverted index per prefix scheme; its length is used as p_prefix
+    threshold: f64,
+    function_paths_and_lengths: &HashMap<blake3::Hash, (&str, usize)>,
+) -> Result<HashMap<blake3::Hash, HashSet<blake3::Hash>>> {
+    // For every entry of `function_paths_and_lengths`: load and tokenize the file, probe the per-scheme indices with a growing token prefix to queue candidates, then hand them to `verify_candidates`. Unreadable or oversized files are logged and skipped.
+    let mut clone_map: HashMap<blake3::Hash, HashSet<blake3::Hash>> = HashMap::new(); //key is the original function id, value is a set of clones of that function
+
+    let word_matcher: Matcher = Matcher::words_matcher();
+    let p_prefix = vector_of_indices.len();
+    for (path, origin_word_count) in function_paths_and_lengths.values() {
+        info!("Path: {}, Words: {}", path, origin_word_count);
+        match load_file(path, 1024 * 1024 * 1024) {
+            Ok(Ok(function_code)) => {
+                let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase());
+                let mut origin_vectored_bow = local_bow.vectorize();
+                origin_vectored_bow.sort_by_key(|(token, _)| {
+                    Reverse(
+                        token_rankings
+                            .get(token)
+                            .map(|(_, rank)| *rank)
+                            .unwrap_or(usize::MAX),
+                    )
+                });
+                let origin_function_id = blake3::hash(path.as_bytes());
+                let mut candidate_map = CandidateMap::new();
+
+                let prefix_length = origin_word_count
+                    - ((*origin_word_count as f64) * threshold).round() as usize
+                    + 1;
+
+                let init_prefix_end = weighted_prefix_end(&origin_vectored_bow, prefix_length);
+                let mut filter_cost_vector: Vec<usize> = Vec::new();
+                filter_cost_vector.push(0); //cost of prefix scheme 1 is calculated from an empty prefix, so the initial cost is 0
+                let mut verification_cost_vector: Vec<usize> = Vec::new();
+                verification_cost_vector.push(0); //verification cost is estimated as 0 for the first prefix scheme since we haven't seen any candidates yet
+                let mut total_cost_vector: Vec<usize> = Vec::new();
+                total_cost_vector.push(usize::MAX); //total cost is initially set to max so that the 0-prefix can never be chosen as the best prefix scheme
+                                                    // big loop, will be used for the different prefix schemes
+                let mut origin_cumulative_count = 0usize; // NOTE(review): declared outside the scheme loop, yet every scheme re-scans the prefix from index 0 and adds the counts again; `*origin_word_count - origin_cumulative_count` below can then underflow - confirm
+                'prefix_schemes: for p in 1..=p_prefix {
+                    let mut filter_cost = filter_cost_vector[p - 1]; // start with the filter cost of the previous prefix scheme
+                    let prefix_end = init_prefix_end + p - 1; //the prefix end for the current scheme is at least the prefix end of the first scheme + the number of tokens in the prefix - 1 (since p-prefix is at least 1)
+
+                    for (idx, token_tuple) in
+                        origin_vectored_bow.iter().take(prefix_end).enumerate()
+                    {
+                        //loop through the prefix vector of the current scheme, for the first scheme this is just the original prefix vector, for the next schemes this includes additional tokens
+                        let is_new = idx + 1 == prefix_end;
+                        origin_cumulative_count += token_tuple.1;
+                        filter_cost += delta_filter_cost(token_tuple, vector_of_indices, p, is_new);
+                        for candidate in vector_of_indices[p - 1]
+                            .get(&token_tuple.0)
+                            .unwrap_or(&Vec::new())
+                        {
+                            let candidate_word_count = function_paths_and_lengths
+                                .get(&candidate.0)
+                                .map(|(_, count)| *count)
+                                .unwrap_or(0);
+
+                            if candidate_word_count
+                                > ((*origin_word_count as f64) * threshold).round() as usize
+                            {
+                                let new_matches = min(token_tuple.1, candidate.1);
+                                let function_id = candidate.0;
+                                let last_token_seen_pos = candidate.2; // (token_position, cumulative_count)
+                                let current_threshold =
+                                    (max(*origin_word_count, candidate_word_count) as f64
+                                        * threshold)
+                                        .round() as usize;
+                                let upper_bound = min(
+                                    *origin_word_count - origin_cumulative_count,
+                                    candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates
+                                );
+                                if candidate_map.get_token_matches(&function_id) + upper_bound
+                                    >= current_threshold
+                                {
+                                    candidate_map.add_pending_update(
+                                        function_id,
+                                        new_matches,
+                                        last_token_seen_pos,
+                                    );
+                                }
+                            }
+                        }
+                        filter_cost_vector.push(filter_cost); // NOTE(review): pushed once per token, but these vectors are indexed by the scheme number p below - confirm the pushes are not meant to follow the token loop
+                        verification_cost_vector.push(candidate_map.verification_cost_estimate(p));
+                        total_cost_vector.push(filter_cost + verification_cost_vector[p]);
+
+                        if total_cost_vector[p] > total_cost_vector[p - 1] {
+                            info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]);
+                            //return verify_candidates(candidate_map, path, function_code, p - 1); //Need to keep in mind if candidate map is already updated with new prefix scheme
+                            verify_candidates(
+                                origin_function_id,
+                                &origin_vectored_bow,
+                                (idx, origin_cumulative_count),
+                                &mut candidate_map,
+                                &mut clone_map,
+                                p_prefix,
+                                token_rankings,
+                                threshold,
+                                function_paths_and_lengths,
+                            )?;
+                            break 'prefix_schemes;
+                        } else {
+                            //apply updates
+                            candidate_map.apply_pending_updates(function_paths_and_lengths);
+                            if p == p_prefix {
+                                //return verify_candidates(candidate_map, path, function_code, p);
+                                verify_candidates(
+                                    origin_function_id,
+                                    &origin_vectored_bow,
+                                    (idx, origin_cumulative_count),
+                                    &mut candidate_map,
+                                    &mut clone_map,
+                                    p_prefix,
+                                    token_rankings,
+                                    threshold,
+                                    function_paths_and_lengths,
+                                )?;
+                                break 'prefix_schemes;
+                            }
+                        }
+                    }
+                }
+            }
+            Ok(Err(_e)) => {
+                info!("Warning: File too large at path '{}', skipping.", path);
+            }
+            Err(_e) => {
+                info!("Failed to read file at path '{}', skipping.", path);
+            }
         }
     }
-    let total_cost = previous_cost + cost;
-    total_cost
-} */
 
-/* fn clone_detection(
-    input_file: &DataFrame,
-    token_rankings: HashMap<Vec<u8>, (usize, usize)>,
-    vector_of_indices: &Vec<InvertedIndex>,
+    Ok(clone_map)
+}
+
+fn verify_candidates(
+    origin_function_id: blake3::Hash,
+    origin_vectored_bow: &Vec<(Vec<u8>, usize)>,
+    origin_last_token_seen_pos: (usize, usize),
+    candidate_map: &mut CandidateMap,
+    clone_map: &mut HashMap<blake3::Hash, HashSet<blake3::Hash>>,
+    p_prefix: usize, // minimum number of recorded matches a candidate needs in order to be verified
+    token_rankings: &HashMap<Vec<u8>, (usize, usize)>,
     threshold: f64,
-) -> Result<()> { // result will probably be a 'clone-map'. Unsure for now if it has to be its own data-structure or if i can reuse the candidate map from before.
-    // This is where the actual clone detection happens, currently not implemented
+    function_paths_and_lengths: &HashMap<blake3::Hash, (&str, usize)>,
+) -> Result<()> {
+    // Verifies the candidates collected for one origin function: each candidate with at least `p_prefix`
+    // recorded matches is loaded, tokenized, sorted like the origin and merge-compared against the origin's token vector.
+    // Candidates whose total matches reach the threshold are inserted into `clone_map` under the origin's id.
+    let word_matcher: Matcher = Matcher::words_matcher();
+    let origin_word_count = function_paths_and_lengths
+        .get(&origin_function_id)
+        .map(|(_, count)| *count)
+        .unwrap_or(0);
+    let origin_vectored_bow = origin_vectored_bow.to_owned(); // NOTE(review): clones the whole vector although it is only read below
+    let origin_token_count = origin_vectored_bow.len();
+    let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least");
+    let mut origin_last_token_seen_pos = origin_last_token_seen_pos; // (token_position, cumulative_count); NOTE(review): advanced inside the loop below and never reset, so each later candidate starts where the previous one stopped - confirm this is intended
+    for candidate_id in candidates_to_verify {
+        let (path, length) = function_paths_and_lengths
+            .get(&candidate_id)
+            .copied()
+            .unwrap(); // panics if a candidate id has no entry in function_paths_and_lengths
+        match load_file(path, 1024 * 1024 * 1024) {
+            Ok(Ok(candidate_code)) => {
+                // Handle successful file load
+                // load function, sort tokens by global frequency, calculate similarity, if above threshold add to clone map
+                let candidate_bow = word_matcher.bag_of_words(&candidate_code.to_ascii_lowercase());
+                let mut vectored_candidate_bow = candidate_bow.vectorize();
+                vectored_candidate_bow.sort_by_key(|(token, _)| {
+                    Reverse(
+                        token_rankings
+                            .get(token)
+                            .map(|(_, rank)| *rank)
+                            .unwrap_or(usize::MAX),
+                    )
+                });
+                let candidate_word_count = length;
+                let candidate_token_count = vectored_candidate_bow.len();
+                let current_threshold = (max(origin_word_count, candidate_word_count) as f64
+                    * threshold)
+                    .round() as usize;
+                let mut candidate_last_token_seen_pos =
+                    candidate_map.get_last_token_seen_pos(&candidate_id); // (token_position, cumulative_count)
+                let mut new_matches = 0usize;
+                while origin_last_token_seen_pos.0 < origin_token_count
+                    && candidate_last_token_seen_pos.0 < candidate_token_count
+                {
+                    if min(
+                        origin_token_count - origin_last_token_seen_pos.1, // NOTE(review): *_token_count is a vector length while `.1` is a cumulative count - mixed units, and the subtraction can underflow when tokens repeat; confirm
+                        candidate_token_count - candidate_last_token_seen_pos.1,
+                    ) > current_threshold
+                    {
+                        let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0];
+                        let candidate_token_tuple =
+                            &vectored_candidate_bow[candidate_last_token_seen_pos.0];
+                        if origin_token_tuple.0 == candidate_token_tuple.0 {
+                            //it's a match
+                            new_matches += min(origin_token_tuple.1, candidate_token_tuple.1);
+                            candidate_last_token_seen_pos.0 += 1;
+                            candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
+                            origin_last_token_seen_pos.0 += 1;
+                            origin_last_token_seen_pos.1 += origin_token_tuple.1;
+                        } else if token_rankings
+                            .get(&origin_token_tuple.0)
+                            .map(|(_, rank)| *rank)
+                            .unwrap_or(usize::MAX)
+                            < token_rankings
+                                .get(&candidate_token_tuple.0)
+                                .map(|(_, rank)| *rank)
+                                .unwrap_or(usize::MAX)
+                        {
+                            //origin token is more frequent than candidate token, so we move in the origin vector
+                            origin_last_token_seen_pos.0 += 1;
+                            origin_last_token_seen_pos.1 += origin_token_tuple.1;
+                        } else {
+                            //candidate token is more frequent than origin token, so we move in the candidate vector
+                            candidate_last_token_seen_pos.0 += 1;
+                            candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
+                        }
+                    } else {
+                        //the upper bound of the remaining matches is not enough to reach the threshold, so we can stop comparing this candidate
+                        break;
+                    }
+                }
+                candidate_map.add_candidate(
+                    candidate_id,
+                    function_paths_and_lengths,
+                    new_matches,
+                    candidate_last_token_seen_pos,
+                );
+                if candidate_map.get_token_matches(&candidate_id) >= current_threshold {
+                    //add to clone map
+                    clone_map
+                        .entry(origin_function_id)
+                        .or_default()
+                        .insert(candidate_id);
+                    info!(
+                        "Clone detected! Original: {}, Candidate: {}, Similarity: {:.2} %",
+                        origin_function_id,
+                        candidate_id,
+                        (candidate_map.get_token_matches(&candidate_id) as f64
+                            / max(origin_word_count, candidate_word_count) as f64)
+                            * 100.0
+                    );
+                }
+            }
+            Ok(Err(_)) => {
+                info!("Warning: File too large at path '{}', skipping.", path);
+            }
+            Err(_) => {
+                info!("Failed to read file at path '{}', skipping.", path);
+            }
+        }
+    }
     Ok(())
-} */
+}
diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs
index 995ba30..32bb9eb 100644
--- a/src/utils/candidate_map.rs
+++ b/src/utils/candidate_map.rs
@@ -1,14 +1,18 @@
+use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 
 pub struct CandidateEntry {
     pub matches: usize,
     pub length: usize,
-    pub last_token_seen_pos: usize,
+    pub last_token_seen_pos: (usize, usize), // (token_position, cumulative_count)
 }
 
 pub struct CandidateMap {
     entries: HashMap<blake3::Hash, CandidateEntry>,
     match_histogram: HashMap<usize, HashSet<blake3::Hash>>,
+    pending_updates: Vec<(blake3::Hash, usize, (usize, usize))>, // queued (function_id, new_matches, last_token_seen_pos) updates, consumed by apply_pending_updates
+    min_length: usize, // smallest candidate length inserted so far (usize::MAX while the map is empty)
+    max_length: usize, // largest candidate length inserted so far (0 while the map is empty)
 }
 
 impl Default for CandidateMap {
@@ -22,21 +26,68 @@ impl CandidateMap {
         Self {
             entries: HashMap::new(),
             match_histogram: HashMap::new(),
+            min_length: usize::MAX,
+            max_length: 0,
+            pending_updates: Vec::new(),
+        }
+    }
+
+    /// Matches recorded so far for `function_id`; 0 when it is not a candidate.
+    pub fn get_token_matches(&self, function_id: &blake3::Hash) -> usize {
+        self.entries
+            .get(function_id)
+            .map_or(0, |candidate| candidate.matches)
+    }
+
+    pub fn add_pending_update(
+        &mut self,
+        function_id: blake3::Hash,
+        new_matches: usize,
+        last_token_seen_pos: (usize, usize),
+    ) {
+        let queued = (function_id, new_matches, last_token_seen_pos); // only recorded here; `entries` is untouched
+        self.pending_updates.push(queued);
+    }
+
+    /// Applies every queued update through `add_candidate`, in the order it was queued.
+    ///
+    /// The queue is empty afterwards.
+    pub fn apply_pending_updates(
+        &mut self,
+        function_paths_and_lengths: &HashMap<blake3::Hash, (&str, usize)>,
+    ) {
+        // Move the queue out of `self` (no copy into a temporary Vec) so that
+        // `add_candidate` can borrow `self` mutably while the updates are consumed.
+        let updates = std::mem::take(&mut self.pending_updates);
+        for (id, matches, pos) in updates {
+            self.add_candidate(id, function_paths_and_lengths, matches, pos);
         }
     }
 
     pub fn add_candidate(
         &mut self,
         function_id: blake3::Hash,
-        length: usize,
+        function_paths_and_lengths: &std::collections::HashMap<blake3::Hash, (&str, usize)>,
         new_matches: usize,
-        last_token_seen_pos: usize,
+        last_token_seen_pos: (usize, usize),
     ) {
-        let entry = self.entries.entry(function_id).or_insert(CandidateEntry {
-            matches: 0,
-            length,
-            last_token_seen_pos,
-        });
+        let entry = match self.entries.entry(function_id) {
+            Entry::Occupied(occupied) => occupied.into_mut(),
+            Entry::Vacant(vacant) => {
+                let length = function_paths_and_lengths
+                    .get(&function_id)
+                    .map(|(_, count)| *count)
+                    .unwrap_or(0);
+                let last_token_seen_pos = (0, 0); // Initialize to (0, 0) for new candidates
+                self.min_length = self.min_length.min(length);
+                self.max_length = self.max_length.max(length);
+                vacant.insert(CandidateEntry {
+                    matches: 0,
+                    length,
+                    last_token_seen_pos,
+                })
+            }
+        };
 
         // Update the match histogram
         if entry.matches > 0 {
@@ -46,15 +97,52 @@ impl CandidateMap {
         }
 
         entry.matches += new_matches;
-        entry.length = length;
         entry.last_token_seen_pos = last_token_seen_pos;
-
         self.match_histogram
             .entry(entry.matches)
             .or_default()
             .insert(function_id);
     }
 
+    /// Returns `Some((min_length, max_length))`, or `None` while the map holds no entries.
+    pub fn length_range(&self) -> Option<(usize, usize)> {
+        if self.entries.is_empty() {
+            return None;
+        }
+        Some((self.min_length, self.max_length))
+    }
+
+    /// Ids with exactly `n` matches (`"exact"`) or with `n` or more (`"at_least"`); panics on other modes.
+    pub fn get_candidates_with_n_matches(&self, n: usize, mode: &str) -> HashSet<blake3::Hash> {
+        match mode {
+            "exact" => self.match_histogram.get(&n).cloned().unwrap_or_default(),
+            "at_least" => self
+                .match_histogram
+                .iter()
+                .filter(|(&matches, _)| matches >= n)
+                .flat_map(|(_, bucket)| bucket.iter().copied())
+                .collect(),
+            other => panic!("Invalid mode: {}", other),
+        }
+    }
+
+    /// Returns the `(token_position, cumulative_count)` pair stored for
+    /// `function_id`; ids without an entry yield `(0, 0)`.
+    pub fn get_last_token_seen_pos(&self, function_id: &blake3::Hash) -> (usize, usize) {
+        let stored = self.entries.get(function_id);
+        stored.map_or((0, 0), |entry| entry.last_token_seen_pos)
+    }
+
+    pub fn update_last_token_seen_pos(
+        &mut self,
+        function_id: &blake3::Hash,
+        new_pos: (usize, usize),
+    ) {
+        if let Some(entry) = self.entries.get_mut(function_id) {
+            entry.last_token_seen_pos = new_pos; // overwrite the stored position in place
+        } // ids without an entry fall through: nothing is inserted
+    }
+
     pub fn count_candidates_with_n_matches(&self, n: usize, mode: &str) -> usize {
         if mode == "exact" {
             self.match_histogram
@@ -71,4 +159,29 @@ impl CandidateMap {
             panic!("Invalid mode: {}", mode);
         }
     }
+
+    /// Rough verification cost for match threshold `n`: (candidates already at `n` or more matches, plus
+    /// pending updates currently at `n - 1`) multiplied by `(min_length + max_length) / 2`; 0 with no entries.
+    pub fn verification_cost_estimate(&self, n: usize) -> usize {
+        let mut number_of_candidates = self.count_candidates_with_n_matches(n, "at_least"); // candidates that have already reached n matches
+        let mut survivors = 0usize;
+        for candidate in &self.pending_updates {
+            let function_id = candidate.0;
+            let current_matches = self.get_token_matches(&function_id);
+            if current_matches == n - 1 {
+                survivors += 1;
+            }
+        }
+        number_of_candidates += survivors; // add the candidates that are about to reach n matches
+                                           // Candidates below n - 1 matches that would also reach n (new_matches > 1)
+                                           // are deliberately not counted. Author's note: those should always satisfy
+                                           // property 1, a candidate cannot come back after being eliminated once,
+                                           // and the case is very rare.
+        let length_range = self.length_range().unwrap_or((usize::MAX, 0));
+        let average_length = if length_range.0 == usize::MAX {
+            0
+        } else {
+            (length_range.0 + length_range.1) / 2
+        };
+        number_of_candidates * average_length
+    }
 }
diff --git a/src/utils/inverted_index.rs b/src/utils/inverted_index.rs
index a976fd3..5be6334 100644
--- a/src/utils/inverted_index.rs
+++ b/src/utils/inverted_index.rs
@@ -1,7 +1,7 @@
 use blake3::Hash;
 use std::collections::HashMap;
 pub struct InvertedIndex {
-    map: HashMap<Vec<u8>, Vec<(Hash, usize)>>, // Maps tokens to a list of function IDs where they appear as well as the frequency of the token in that function
+    map: HashMap<Vec<u8>, Vec<(Hash, usize, (usize, usize))>>, // token -> Vec<(function_id, count, (token_position, cumulative_count))>
 }
 
 impl Default for InvertedIndex {
@@ -17,14 +17,24 @@ impl InvertedIndex {
         }
     }
 
-    pub fn add(&mut self, token: &Vec<u8>, count: usize, function_id: Hash) {
-        self.map
-            .entry(token.to_owned())
-            .or_default()
-            .push((function_id, count));
+    /// Appends the posting `(function_id, count, (token_position, cumulative_count))` under `token`.
+    /// `token_position` is the index of the token; `cumulative_count` is the number of words seen up to and including this token, duplicates included.
+    pub fn add(
+        &mut self,
+        token: &Vec<u8>,
+        function_id: Hash,
+        count: usize,
+        token_position: usize,
+        cumulative_count: usize,
+    ) {
+        self.map.entry(token.to_owned()).or_default().push((
+            function_id,
+            count,
+            (token_position, cumulative_count),
+        ));
     }
 
-    pub fn get(&self, token: &Vec<u8>) -> Option<&Vec<(Hash, usize)>> {
+    pub fn get(&self, token: &Vec<u8>) -> Option<&Vec<(Hash, usize, (usize, usize))>> {
         self.map.get(token)
     }
 
@@ -43,7 +53,7 @@ impl InvertedIndex {
     pub fn token_frequency(&self, token: &Vec<u8>, count_duplicates: bool) -> usize {
         if let Some(functions) = self.get(token) {
             if count_duplicates {
-                functions.iter().map(|(_, count)| *count).sum()
+                functions.iter().map(|(_, count, _)| *count).sum()
             } else {
                 functions.len()
             }

From 710fd6777eaded4347de49b5bf94a534c8ec59a2 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Wed, 8 Apr 2026 13:48:05 +0200
Subject: [PATCH 09/14] fixed cost estimation logic

---
 src/phases/type_3_duplicate_files.rs | 74 +++++++++++++++++-----------
 src/utils/candidate_map.rs           |  3 +-
 2 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs
index 4ea31b9..75f37cf 100644
--- a/src/phases/type_3_duplicate_files.rs
+++ b/src/phases/type_3_duplicate_files.rs
@@ -239,7 +239,7 @@ pub fn run(
         &function_paths_and_lengths,
     )?;
     info!(
-        "Finished detecting clones. {} unique files found.",
+        "Finished detecting clones. {} clones found.",
         clone_map.len()
     );
     Ok(())
@@ -375,7 +375,8 @@ fn detect_clones(
     let word_matcher: Matcher = Matcher::words_matcher();
     let p_prefix = vector_of_indices.len();
     for (path, origin_word_count) in function_paths_and_lengths.values() {
-        info!("Path: {}, Words: {}", path, origin_word_count);
+        info!("-----------------------------------------------------------------------------");
+        // info!("Path: {}, Words: {}", path, origin_word_count);
         match load_file(path, 1024 * 1024 * 1024) {
             Ok(Ok(function_code)) => {
                 let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase());
@@ -404,15 +405,19 @@ fn detect_clones(
                 total_cost_vector.push(usize::MAX); //total cost is initially set to max since so 0-prefix can never be chosen as the best prefix scheme
                                                     // big loop, will be used for the different prefix schemes
                 let mut origin_cumulative_count = 0usize;
+                let mut origin_token_position = 0usize;
                 'prefix_schemes: for p in 1..=p_prefix {
                     let mut filter_cost = filter_cost_vector[p - 1]; // start with the filter cost of the previous prefix scheme
                     let prefix_end = init_prefix_end + p - 1; //the prefix end for the current scheme is at least the prefix end of the first scheme + the number of tokens in the prefix - 1 (since p-prefix is at least 1)
 
-                    for (idx, token_tuple) in
-                        origin_vectored_bow.iter().take(prefix_end).enumerate()
+                    /* for (idx, token_tuple) in
+                    origin_vectored_bow.iter().take(prefix_end).enumerate() */
+                    while origin_token_position < prefix_end
+                        && origin_token_position < origin_vectored_bow.len()
                     {
+                        let token_tuple = origin_vectored_bow.get(origin_token_position).unwrap();
                         //loop through the prefix vector of the current scheme, for the first scheme this is just the original prefix vector, for the next schemes this includes additional tokens
-                        let is_new = idx + 1 == prefix_end;
+                        let is_new = origin_token_position + 1 == prefix_end;
                         origin_cumulative_count += token_tuple.1;
                         filter_cost += delta_filter_cost(token_tuple, vector_of_indices, p, is_new);
                         for candidate in vector_of_indices[p - 1]
@@ -449,17 +454,41 @@ fn detect_clones(
                                 }
                             }
                         }
-                        filter_cost_vector.push(filter_cost);
-                        verification_cost_vector.push(candidate_map.verification_cost_estimate(p));
-                        total_cost_vector.push(filter_cost + verification_cost_vector[p]);
+                        origin_token_position += 1;
+                    }
+                    if p == 1 {
+                        candidate_map.apply_pending_updates(function_paths_and_lengths);
+                        //apply updates for the first prefix scheme before estimating costs since it relies on min/max length
+                    }
+                    let verification_cost = candidate_map.verification_cost_estimate(p);
+                    filter_cost_vector.push(filter_cost);
+                    verification_cost_vector.push(verification_cost);
+                    total_cost_vector.push(filter_cost + verification_cost);
 
-                        if total_cost_vector[p] > total_cost_vector[p - 1] {
-                            info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]);
-                            //return verify_candidates(candidate_map, path, function_code, p - 1); //Need to keep in mind if candidate map is already updated with new prefix scheme
+                    if total_cost_vector[p] > total_cost_vector[p - 1] {
+                        info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]);
+                        info!("The next prefix scheme {} has estimated total cost of {}, filter cost: {}, verification cost: {}.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]);
+                        verify_candidates(
+                            origin_function_id,
+                            &origin_vectored_bow,
+                            (origin_token_position, origin_cumulative_count),
+                            &mut candidate_map,
+                            &mut clone_map,
+                            p_prefix,
+                            token_rankings,
+                            threshold,
+                            function_paths_and_lengths,
+                        )?;
+                        break 'prefix_schemes;
+                    } else {
+                        //apply updates
+                        candidate_map.apply_pending_updates(function_paths_and_lengths);
+                        if p == p_prefix {
+                            //return verify_candidates(candidate_map, path, function_code, p);
                             verify_candidates(
                                 origin_function_id,
                                 &origin_vectored_bow,
-                                (idx, origin_cumulative_count),
+                                (origin_token_position, origin_cumulative_count),
                                 &mut candidate_map,
                                 &mut clone_map,
                                 p_prefix,
@@ -468,27 +497,12 @@ fn detect_clones(
                                 function_paths_and_lengths,
                             )?;
                             break 'prefix_schemes;
-                        } else {
-                            //apply updates
-                            candidate_map.apply_pending_updates(function_paths_and_lengths);
-                            if p == p_prefix {
-                                //return verify_candidates(candidate_map, path, function_code, p);
-                                verify_candidates(
-                                    origin_function_id,
-                                    &origin_vectored_bow,
-                                    (idx, origin_cumulative_count),
-                                    &mut candidate_map,
-                                    &mut clone_map,
-                                    p_prefix,
-                                    token_rankings,
-                                    threshold,
-                                    function_paths_and_lengths,
-                                )?;
-                                break 'prefix_schemes;
-                            }
                         }
                     }
                 }
+                info!("Filter cost vector: {:?}", filter_cost_vector);
+                info!("Verification cost vector: {:?}", verification_cost_vector);
+                info!("Total cost vector: {:?}", total_cost_vector);
             }
             Ok(Err(_e)) => {
                 info!("Warning: File too large at path '{}', skipping.", path);
diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs
index 32bb9eb..52bee53 100644
--- a/src/utils/candidate_map.rs
+++ b/src/utils/candidate_map.rs
@@ -167,7 +167,8 @@ impl CandidateMap {
         for candidate in &self.pending_updates {
             let function_id = candidate.0;
             let current_matches = self.get_token_matches(&function_id);
-            if current_matches == n - 1 {
+            if n > 1 && current_matches == n - 1 {
+                // if n==1 the pending list is empty as they have already been applied
                 survivors += 1;
             }
         }

From 72fe49812acae2fdb45dbee3abc9e5c990b5e139 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Sun, 12 Apr 2026 13:59:33 +0200
Subject: [PATCH 10/14] temp for migrating

---
 src/bin/main.rs                               |  1 +
 src/phases/parse.rs                           |  2 +
 src/phases/type_3_duplicate_files.rs          | 75 +++++++++++++++----
 .../type_3_duplicate_files/files/original.py  |  6 ++
 .../files/original.py.functions/1-1           |  6 ++
 .../type_3_duplicate_files/files/type_1.py    | 11 +++
 .../files/type_1.py.functions/1-1             | 11 +++
 .../type_3_duplicate_files/files/type_2.py    | 12 +++
 .../files/type_2.py.functions/1-1             | 12 +++
 .../type_3_duplicate_files/files/type_3.py    | 11 +++
 .../files/type_3.py.functions/1-1             | 11 +++
 .../type_3_duplicate_files/parser_log.csv     |  5 ++
 .../phases/type_3_duplicate_files/python.json | 12 +++
 .../test_parser_input.csv                     |  5 ++
 .../test_parser_output.functions.csv          |  5 ++
 15 files changed, 171 insertions(+), 14 deletions(-)
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/original.py
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_1.py
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_2.py
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_3.py
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/parser_log.csv
 create mode 100644 tests/data/phases/type_3_duplicate_files/python.json
 create mode 100644 tests/data/phases/type_3_duplicate_files/test_parser_input.csv
 create mode 100644 tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv

diff --git a/src/bin/main.rs b/src/bin/main.rs
index 6d5a78c..715e788 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -267,6 +267,7 @@ fn main() {
                                     cli_subargs.get_one::<String>("map").map(|x| x.as_str()),
                                     cli_subargs.get_one::<String>("logs").map(|x| x.as_str()),
                                     /* languages */
+                                    cli_subargs.get_one::<String>("language").map(|s| s.as_str()),
                                     *cli_subargs.get_one::<usize>("threads").unwrap(),
                                     *cli_subargs.get_one::<usize>("p_prefix").unwrap(),
                                     *cli_subargs.get_one::<f64>("threshold").unwrap(),
diff --git a/src/phases/parse.rs b/src/phases/parse.rs
index 8ed9661..cabaf25 100644
--- a/src/phases/parse.rs
+++ b/src/phases/parse.rs
@@ -455,6 +455,7 @@ fn analyze_file(
     ignore_comments: bool,
     word_counter: &Matcher,
 ) -> Result<(String, Option<String>)> {
+    info!("analyze_file called with path: {path}");
     let grammar = language_to_grammar(language)
         .with_context(|| format!("Unsupported language: {language}"))?;
     // Initializes the parser
@@ -462,6 +463,7 @@ fn analyze_file(
     parser.set_language(&grammar.lang)?;
     match load_file(path, 1024 * 1024 * 1024)? {
         Ok(source_code) => {
+            info!("File {path} loaded successfully");
             // Creates a folder to store the functions of the file
             let target_folder: String = format!("{path}.functions");
             create_dir(&target_folder)?;
diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs
index 75f37cf..8a95635 100644
--- a/src/phases/type_3_duplicate_files.rs
+++ b/src/phases/type_3_duplicate_files.rs
@@ -47,10 +47,11 @@ pub fn cli() -> Command {
                 .required(false),
         )
         .arg(
-            Arg::new("languages")
+            Arg::new("language")
                 .short('g')
                 .long("languages")
-                .help("Comma-separated list of languages to consider. If not provided, all languages will be considered.")
+                /* .help("Comma-separated list of languages to consider. If not provided, all languages will be considered.") */
+                .help("language as a string, e.g. 'java'. If not provided, defaults to 'java'. TODO")
                 .required(false),
         )
         .arg(
@@ -91,13 +92,15 @@ pub fn run(
     _map_path: Option<&str>, //optional path to the map CSV file to store the mapping of clones to their originals.
     _logs_path: Option<&str>, //for error logs, not implemented yet
     /* _opt_languages: Option<Vec<&str>>, //optional list of languages. Currently java is hardcoded */
+    opt_language: Option<&str>,
     _threads: usize,               //current implementation is single-threaded
     p_prefix: usize,               //number of tokens to consider for the prefix, default is 1
     threshold: f64,                //threshold for the prefix length, default is 0.8
     example_word: Option<&String>, //an example word to check the global Bag of Words for, optional
     _logger: &Logger,
 ) -> Result<()> {
-    let language = "java";
+    //let language = "java";
+    let language = opt_language.unwrap_or("java"); //default to java currently
     let minimum_loc = 5; //temporary
     let mut input_file = open_csv(
         input_path,
@@ -531,10 +534,14 @@ fn verify_candidates(
     // to be considered clones based on their full token vectors.
     // The clone_map is updated with the results, mapping original function ids to sets of clone function ids.
     let word_matcher: Matcher = Matcher::words_matcher();
-    let origin_word_count = function_paths_and_lengths
+    let (origin_path, origin_word_count) = function_paths_and_lengths
         .get(&origin_function_id)
-        .map(|(_, count)| *count)
-        .unwrap_or(0);
+        .copied()
+        .unwrap_or(("Unknown", 0));
+    info!(
+        "Verifying candidates for function at path '{}', with word count {}.",
+        origin_path, origin_word_count
+    );
     let origin_vectored_bow = origin_vectored_bow.to_owned();
     let origin_token_count = origin_vectored_bow.len();
     let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least");
@@ -544,6 +551,9 @@ fn verify_candidates(
             .get(&candidate_id)
             .copied()
             .unwrap();
+        if candidate_id == origin_function_id {
+            continue; //skip comparing the function to itself
+        }
         match load_file(path, 1024 * 1024 * 1024) {
             Ok(Ok(candidate_code)) => {
                 // Handle successful file load
@@ -563,22 +573,31 @@ fn verify_candidates(
                 let current_threshold = (max(origin_word_count, candidate_word_count) as f64
                     * threshold)
                     .round() as usize;
+                info!("Current threshold: {}", current_threshold);
                 let mut candidate_last_token_seen_pos =
                     candidate_map.get_last_token_seen_pos(&candidate_id); // (token_position, cumulative_count)
                 let mut new_matches = 0usize;
                 while origin_last_token_seen_pos.0 < origin_token_count
                     && candidate_last_token_seen_pos.0 < candidate_token_count
                 {
-                    if min(
-                        origin_token_count - origin_last_token_seen_pos.1,
-                        candidate_token_count - candidate_last_token_seen_pos.1,
-                    ) > current_threshold
-                    {
+                    let upper_bound = min(
+                        origin_word_count - origin_last_token_seen_pos.1,
+                        candidate_word_count - candidate_last_token_seen_pos.1,
+                    ) + candidate_map.get_token_matches(&candidate_id);
+
+                    if upper_bound > current_threshold {
+                        info!("IF MIN");
                         let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0];
                         let candidate_token_tuple =
                             &vectored_candidate_bow[candidate_last_token_seen_pos.0];
                         if origin_token_tuple.0 == candidate_token_tuple.0 {
                             //it's a match
+                            info!("MATCHING!");
+                            info!(
+                                "MATCH! origin: {}, candidate: {}",
+                                String::from_utf8_lossy(&origin_token_tuple.0),
+                                String::from_utf8_lossy(&candidate_token_tuple.0)
+                            );
                             new_matches += min(origin_token_tuple.1, candidate_token_tuple.1);
                             candidate_last_token_seen_pos.0 += 1;
                             candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
@@ -594,15 +613,41 @@ fn verify_candidates(
                                 .unwrap_or(usize::MAX)
                         {
                             //origin token is more frequent than candidate token, so we move in the origin vector
+                            info!(
+                                "origin_count > candidate_count: origin: {}, candidate: {}",
+                                String::from_utf8_lossy(&origin_token_tuple.0),
+                                String::from_utf8_lossy(&candidate_token_tuple.0)
+                            );
                             origin_last_token_seen_pos.0 += 1;
                             origin_last_token_seen_pos.1 += origin_token_tuple.1;
                         } else {
                             //candidate token is more frequent than origin token, so we move in the candidate vector
+                            info!(
+                                "candidate_count > origin_count: origin: {}, candidate: {}",
+                                String::from_utf8_lossy(&origin_token_tuple.0),
+                                String::from_utf8_lossy(&candidate_token_tuple.0)
+                            );
                             candidate_last_token_seen_pos.0 += 1;
                             candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
                         }
                     } else {
                         //the upper bound of the remaining matches is not enough to reach the threshold, so we can stop comparing this candidate
+                        /* info!("UPPER BOUND of remaining matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", min(origin_word_count - origin_last_token_seen_pos.1, candidate_word_count - candidate_last_token_seen_pos.1), current_threshold);
+                        info!(
+                            "Current matches: {}, new matches: {}, total possible matches: {}",
+                            candidate_map.get_token_matches(&candidate_id),
+                            new_matches,
+                            candidate_map.get_token_matches(&candidate_id)
+                                + min(
+                                    origin_word_count - origin_last_token_seen_pos.1,
+                                    candidate_word_count - candidate_last_token_seen_pos.1
+                                )
+                        ); */
+                        info!("UPPER BOUND of remaining matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound, current_threshold);
+                        info!(
+                            "origin_last_token_seen_pos: {}, candidate_last_token_seen_pos: {}",
+                            origin_last_token_seen_pos.0, candidate_last_token_seen_pos.0
+                        );
                         break;
                     }
                 }
@@ -619,9 +664,11 @@ fn verify_candidates(
                         .or_default()
                         .insert(candidate_id);
                     info!(
-                        "Clone detected! Original: {}, Candidate: {}, Similarity: {:.2} %",
-                        origin_function_id,
-                        candidate_id,
+                        "Clone detected! Candidate: {}, Similarity: {:.2} %",
+                        function_paths_and_lengths
+                            .get(&candidate_id)
+                            .map(|(path, _)| *path)
+                            .unwrap_or("Unknown"),
                         (candidate_map.get_token_matches(&candidate_id) as f64
                             / max(origin_word_count, candidate_word_count) as f64)
                             * 100.0
diff --git a/tests/data/phases/type_3_duplicate_files/files/original.py b/tests/data/phases/type_3_duplicate_files/files/original.py
new file mode 100644
index 0000000..cec103e
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/original.py
@@ -0,0 +1,6 @@
+def code_block_1(): 
+    example_array = ["telephone", "dog", "example", "banana", "apple"]
+    # This is the original code block the others are compared to.
+    for word in example_array:
+        print(f"Word: {word}")
+    print("Done")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1
new file mode 100644
index 0000000..6dbf785
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1
@@ -0,0 +1,6 @@
+def code_block_1(): 
+    example_array = ["telephone", "dog", "example", "banana", "apple"]
+    
+    for word in example_array:
+        print(f"Word: {word}")
+    print("Done")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/type_1.py b/tests/data/phases/type_3_duplicate_files/files/type_1.py
new file mode 100644
index 0000000..ba84af1
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/type_1.py
@@ -0,0 +1,11 @@
+def code_block_1(): 
+    # Type-1 Clone
+    # This code block is identical to the original code block with some exceptions
+    # The comment is different and it is formatted differently.
+    example_array = ["telephone", "dog", "example", "banana", "apple"]
+
+    for word in example_array:
+
+        print(f"Word: {word}")
+
+    print("Done")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1
new file mode 100644
index 0000000..8ba21b7
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1
@@ -0,0 +1,11 @@
+def code_block_1(): 
+    
+    
+    
+    example_array = ["telephone", "dog", "example", "banana", "apple"]
+
+    for word in example_array:
+
+        print(f"Word: {word}")
+
+    print("Done")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/type_2.py b/tests/data/phases/type_3_duplicate_files/files/type_2.py
new file mode 100644
index 0000000..91eda16
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/type_2.py
@@ -0,0 +1,12 @@
+def code_block_2():
+    # Type-2 Clone
+    # This code block is identical to the original code block with some exceptions
+    # In addition to the type-1 changes, the variable name are changed and the function name is changed.
+    # One of the literal values has also changed. "telephone" -> "computer".
+    my_array = ["computer", "dog", "example", "banana", "apple"]
+
+    for item in my_array:
+
+        print(f"Word: {item}")
+
+    print("Done")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1
new file mode 100644
index 0000000..03556b6
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1
@@ -0,0 +1,12 @@
+def code_block_2():
+    
+    
+    
+    
+    my_array = ["computer", "dog", "example", "banana", "apple"]
+
+    for item in my_array:
+
+        print(f"Word: {item}")
+
+    print("Done")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/type_3.py b/tests/data/phases/type_3_duplicate_files/files/type_3.py
new file mode 100644
index 0000000..74098d4
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/type_3.py
@@ -0,0 +1,11 @@
+def code_block_3():
+    # Type-3 Clone
+    # This code block is similar to the original code block but with some differences.
+    # In addition to the type-2 changes, it differs at the statement level.
+    # An 'append' statement has been added and the print statement at the end has been removed.
+    my_array = ["computer", "dog", "example", "banana", "apple"]
+    my_array.append("grape")
+
+    for item in my_array:
+
+        print(f"Word: {item}")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1
new file mode 100644
index 0000000..c264039
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1
@@ -0,0 +1,11 @@
+def code_block_3():
+    
+    
+    
+    
+    my_array = ["computer", "dog", "example", "banana", "apple"]
+    my_array.append("grape")
+
+    for item in my_array:
+
+        print(f"Word: {item}")
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/parser_log.csv b/tests/data/phases/type_3_duplicate_files/parser_log.csv
new file mode 100644
index 0000000..872d26f
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/parser_log.csv
@@ -0,0 +1,5 @@
+id,name,language,functions,functions_with_kw,tests/data/phases/type_3_duplicate_files/python.json,parse_error
+1,tests/data/phases/type_3_duplicate_files/files/type_1.py,python,1,1,1,none
+2,tests/data/phases/type_3_duplicate_files/files/type_2.py,python,1,1,1,none
+0,tests/data/phases/type_3_duplicate_files/files/original.py,python,1,1,1,none
+3,tests/data/phases/type_3_duplicate_files/files/type_3.py,python,1,1,1,none
diff --git a/tests/data/phases/type_3_duplicate_files/python.json b/tests/data/phases/type_3_duplicate_files/python.json
new file mode 100644
index 0000000..2be199d
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/python.json
@@ -0,0 +1,12 @@
+{
+    "languages": [
+        {
+            "name": "python",
+            "extensions" : [
+                "py"
+            ],
+            "keywords" : []
+        }
+    ],
+    "keywords": []
+}
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/test_parser_input.csv b/tests/data/phases/type_3_duplicate_files/test_parser_input.csv
new file mode 100644
index 0000000..0c10ab7
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/test_parser_input.csv
@@ -0,0 +1,5 @@
+id,name,language
+0,tests/data/phases/type_3_duplicate_files/files/original.py,python
+1,tests/data/phases/type_3_duplicate_files/files/type_1.py,python
+2,tests/data/phases/type_3_duplicate_files/files/type_2.py,python
+3,tests/data/phases/type_3_duplicate_files/files/type_3.py,python
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv b/tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv
new file mode 100644
index 0000000..6780f28
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv
@@ -0,0 +1,5 @@
+id,path,name,position,language,loc,words,tests/data/phases/type_3_duplicate_files/python.json,loop_statements,loop_nestings,if_statements,if_nestings,functions_calls,function_calls_nestings,params,param_kw_match,parse_error
+1,tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1,code_block_1,1:1,python,11,18,9,1,1,0,0,2,1,0,0,none
+2,tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1,code_block_2,1:1,python,12,18,9,1,1,0,0,2,1,0,0,none
+0,tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1,code_block_1,1:1,python,6,18,9,1,1,0,0,2,1,0,0,none
+3,tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1,code_block_3,1:1,python,11,19,10,1,1,0,0,2,1,0,0,none

From a576665b67766d4f308f97727389f8c364de25a4 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Thu, 23 Apr 2026 13:59:58 +0200
Subject: [PATCH 11/14] switched order of token ranking

---
 Cargo.lock                           |   1 +
 Cargo.toml                           |   1 +
 src/phases/tokenizer.rs              |   2 +-
 src/phases/type_3_duplicate_files.rs | 259 ++++++++++++++++-----------
 src/utils/bow.rs                     |   4 +-
 src/utils/candidate_map.rs           |   4 +-
 6 files changed, 163 insertions(+), 108 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index fd1dadb..2dff1a4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2807,6 +2807,7 @@ dependencies = [
  "crossbeam-channel 0.5.15",
  "csv",
  "curl",
+ "either",
  "indicatif",
  "json",
  "lazy_static",
diff --git a/Cargo.toml b/Cargo.toml
index c7c90f9..1106684 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,6 +34,7 @@ crossbeam="0.7"
 crossbeam-channel="0.5.0"
 csv="1.1"
 curl="0.4"
+either = "1.15.0"
 indicatif = "0.17.9"
 json="0.12"
 lazy_static = "1.4.0"
diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs
index 89f41ee..684adda 100644
--- a/src/phases/tokenizer.rs
+++ b/src/phases/tokenizer.rs
@@ -117,7 +117,7 @@ pub fn run(input_path: &str, example_word: &str, _logger: &Logger) -> Result<()>
         example_word
     );
     info!(
-        "  The token '{}' appears {} times and is ranked {} in the global Bag of Words.",
+        "  The token '{}' appears {} times and is ranked {} in the global Bag of Words (rank 1 = least common token).",
         example_word,
         token_rankings
             .get(example_word_token)
diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs
index 8a95635..1a44e87 100644
--- a/src/phases/type_3_duplicate_files.rs
+++ b/src/phases/type_3_duplicate_files.rs
@@ -8,12 +8,15 @@ use anyhow::{/* Error,  */ Result};
 use blake3;
 use clap::{Arg, Command};
 use core::f64;
+use either::Either;
 use polars::prelude::*;
-use std::cmp::{max, min, Reverse};
+use std::cmp::{max, min};
 use std::collections::{HashMap, HashSet};
 use std::vec;
 use tracing::info;
 
+type CloneMap = HashMap<blake3::Hash, Either<HashSet<blake3::Hash>, blake3::Hash>>;
+
 pub fn cli() -> Command {
     Command::new("type_3_duplicate_files")
         .about("Detects type 3 clones by building an index based on the most common tokens in the functions and their frequencies.")
@@ -101,7 +104,7 @@ pub fn run(
 ) -> Result<()> {
     //let language = "java";
     let language = opt_language.unwrap_or("java"); //default to java currently
-    let minimum_loc = 5; //temporary
+    let minimum_loc = 2; //temporary
     let mut input_file = open_csv(
         input_path,
         Some(Schema::from_iter(vec![
@@ -276,12 +279,10 @@ fn index_builder(
                 let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase());
                 let mut vectored_bow = local_bow.vectorize();
                 vectored_bow.sort_by_key(|(token, _)| {
-                    Reverse(
-                        token_rankings
-                            .get(token)
-                            .map(|(_, rank)| *rank)
-                            .unwrap_or(usize::MAX),
-                    )
+                    token_rankings
+                        .get(token)
+                        .map(|(_, rank)| *rank)
+                        .unwrap_or(usize::MAX)
                 });
                 //let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::<usize>();
                 let codeblock_length = function_paths_and_lengths
@@ -371,9 +372,9 @@ fn detect_clones(
     vector_of_indices: &[InvertedIndex], //changed from &Vec<InvertedIndex> since the compiler requested it
     threshold: f64,
     function_paths_and_lengths: &HashMap<blake3::Hash, (&str, usize)>,
-) -> Result<HashMap<blake3::Hash, HashSet<blake3::Hash>>> {
+) -> Result<CloneMap> {
     // result will probably be a 'clone-map'. Unsure for now if it has to be its own data-structure or if i can reuse the candidate map from before.
-    let mut clone_map: HashMap<blake3::Hash, HashSet<blake3::Hash>> = HashMap::new(); //key is the original function id, value is a set of clones of that function
+    let mut clone_map: CloneMap = HashMap::new();
 
     let word_matcher: Matcher = Matcher::words_matcher();
     let p_prefix = vector_of_indices.len();
@@ -385,14 +386,13 @@ fn detect_clones(
                 let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase());
                 let mut origin_vectored_bow = local_bow.vectorize();
                 origin_vectored_bow.sort_by_key(|(token, _)| {
-                    Reverse(
-                        token_rankings
-                            .get(token)
-                            .map(|(_, rank)| *rank)
-                            .unwrap_or(usize::MAX),
-                    )
+                    token_rankings
+                        .get(token)
+                        .map(|(_, rank)| *rank)
+                        .unwrap_or(usize::MAX)
                 });
                 let origin_function_id = blake3::hash(path.as_bytes());
+                info!("Origin path: {}", path);
                 let mut candidate_map = CandidateMap::new();
 
                 let prefix_length = origin_word_count
@@ -427,34 +427,43 @@ fn detect_clones(
                             .get(&token_tuple.0)
                             .unwrap_or(&Vec::new())
                         {
+                            /* if candidate_id_lt_origin_id(&candidate.0, &origin_function_id) {
+                                info!("DClone: SKIPPING candidate at path '{}' since it has a lower function ID than the origin.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown"));
+                                continue; //skip candidates that have already been processed as origins
+                            } */
+                            if clone_map.contains_key(&candidate.0) {
+                                info!("DClone: SKIPPING candidate at path '{}' since it already has an entry in clone_map.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown"));
+                                continue;
+                            }
                             let candidate_word_count = function_paths_and_lengths
                                 .get(&candidate.0)
                                 .map(|(_, count)| *count)
                                 .unwrap_or(0);
 
                             if candidate_word_count
-                                > ((*origin_word_count as f64) * threshold).round() as usize
+                                < ((*origin_word_count as f64) * threshold).round() as usize
+                            {
+                                continue; //skip candidates that are too small to reach the threshold
+                            }
+
+                            let new_matches = min(token_tuple.1, candidate.1);
+                            let function_id = candidate.0;
+                            let last_token_seen_pos = candidate.2; // (token_position, cumulative_count)
+                            let current_threshold =
+                                (max(*origin_word_count, candidate_word_count) as f64 * threshold)
+                                    .round() as usize;
+                            let upper_bound = min(
+                                *origin_word_count - origin_cumulative_count,
+                                candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates
+                            );
+                            if candidate_map.get_token_matches(&function_id) + upper_bound
+                                >= current_threshold
                             {
-                                let new_matches = min(token_tuple.1, candidate.1);
-                                let function_id = candidate.0;
-                                let last_token_seen_pos = candidate.2; // (token_position, cumulative_count)
-                                let current_threshold =
-                                    (max(*origin_word_count, candidate_word_count) as f64
-                                        * threshold)
-                                        .round() as usize;
-                                let upper_bound = min(
-                                    *origin_word_count - origin_cumulative_count,
-                                    candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates
+                                candidate_map.add_pending_update(
+                                    function_id,
+                                    new_matches,
+                                    last_token_seen_pos,
                                 );
-                                if candidate_map.get_token_matches(&function_id) + upper_bound
-                                    >= current_threshold
-                                {
-                                    candidate_map.add_pending_update(
-                                        function_id,
-                                        new_matches,
-                                        last_token_seen_pos,
-                                    );
-                                }
                             }
                         }
                         origin_token_position += 1;
@@ -463,7 +472,8 @@ fn detect_clones(
                         candidate_map.apply_pending_updates(function_paths_and_lengths);
                         //apply updates for the first prefix scheme before estimating costs since it relies on min/max length
                     }
-                    let verification_cost = candidate_map.verification_cost_estimate(p);
+                    let verification_cost =
+                        candidate_map.verification_cost_estimate(p, origin_word_count);
                     filter_cost_vector.push(filter_cost);
                     verification_cost_vector.push(verification_cost);
                     total_cost_vector.push(filter_cost + verification_cost);
@@ -522,9 +532,9 @@ fn detect_clones(
 fn verify_candidates(
     origin_function_id: blake3::Hash,
     origin_vectored_bow: &Vec<(Vec<u8>, usize)>,
-    origin_last_token_seen_pos: (usize, usize),
+    prefix_origin_last_token_seen_pos: (usize, usize),
     candidate_map: &mut CandidateMap,
-    clone_map: &mut HashMap<blake3::Hash, HashSet<blake3::Hash>>,
+    clone_map: &mut CloneMap,
     p_prefix: usize,
     token_rankings: &HashMap<Vec<u8>, (usize, usize)>,
     threshold: f64,
@@ -544,106 +554,127 @@ fn verify_candidates(
     );
     let origin_vectored_bow = origin_vectored_bow.to_owned();
     let origin_token_count = origin_vectored_bow.len();
-    let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least");
-    let mut origin_last_token_seen_pos = origin_last_token_seen_pos; // (token_position, cumulative_count)
+    let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least"); // (token_position, cumulative_count)
+    let origin_vector_readable: Vec<(String, usize)> = origin_vectored_bow
+        .iter()
+        .map(|(token, count)| (String::from_utf8_lossy(token).to_string(), *count))
+        .collect();
+    info!("sorted origin vector: {:?}", origin_vector_readable);
+    info!("origin_function_id: {:?}", origin_function_id);
+    info!("origin_id as bytes: {:?}", origin_function_id.as_bytes());
     for candidate_id in candidates_to_verify {
+        info!("----------------------");
         let (path, length) = function_paths_and_lengths
             .get(&candidate_id)
             .copied()
             .unwrap();
+        info!("candidate_id as bytes: {:?}", candidate_id.as_bytes());
+        if clone_map.contains_key(&candidate_id) {
+            info!(
+                "SKIPPING candidate at path '{}' since it already has an entry in clone_map.",
+                path
+            );
+            continue;
+        }
         if candidate_id == origin_function_id {
+            info!("Skipping self-comparison for function at path '{}'.", path);
             continue; //skip comparing the function to itself
         }
+        let mut origin_last_token_seen_pos = prefix_origin_last_token_seen_pos;
         match load_file(path, 1024 * 1024 * 1024) {
             Ok(Ok(candidate_code)) => {
                 // Handle successful file load
                 // load function, sort tokens by global frequency, calculate similarity, if above threshold add to clone map
+                info!("Candidate loaded: {}, length: {}", path, length);
                 let candidate_bow = word_matcher.bag_of_words(&candidate_code.to_ascii_lowercase());
                 let mut vectored_candidate_bow = candidate_bow.vectorize();
                 vectored_candidate_bow.sort_by_key(|(token, _)| {
-                    Reverse(
-                        token_rankings
-                            .get(token)
-                            .map(|(_, rank)| *rank)
-                            .unwrap_or(usize::MAX),
-                    )
+                    token_rankings
+                        .get(token)
+                        .map(|(_, rank)| *rank)
+                        .unwrap_or(usize::MAX)
                 });
+                let candidate_vector_readable: Vec<(String, usize)> = vectored_candidate_bow
+                    .iter()
+                    .map(|(token, count)| (String::from_utf8_lossy(token).to_string(), *count))
+                    .collect();
+                info!("sorted candidate vector: {:?}", candidate_vector_readable);
                 let candidate_word_count = length;
                 let candidate_token_count = vectored_candidate_bow.len();
                 let current_threshold = (max(origin_word_count, candidate_word_count) as f64
                     * threshold)
                     .round() as usize;
-                info!("Current threshold: {}", current_threshold);
                 let mut candidate_last_token_seen_pos =
                     candidate_map.get_last_token_seen_pos(&candidate_id); // (token_position, cumulative_count)
                 let mut new_matches = 0usize;
+                let prefix_matches = candidate_map.get_token_matches(&candidate_id);
                 while origin_last_token_seen_pos.0 < origin_token_count
                     && candidate_last_token_seen_pos.0 < candidate_token_count
                 {
                     let upper_bound = min(
                         origin_word_count - origin_last_token_seen_pos.1,
                         candidate_word_count - candidate_last_token_seen_pos.1,
-                    ) + candidate_map.get_token_matches(&candidate_id);
+                    );
+                    let current_matches = prefix_matches + new_matches;
+                    let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0];
+                    let candidate_token_tuple =
+                        &vectored_candidate_bow[candidate_last_token_seen_pos.0];
+
+                    info!("Current threshold: {}", current_threshold);
+                    info!(
+                        "Current matches: {} + {} = {}",
+                        prefix_matches, new_matches, current_matches
+                    );
+                    info!("Upper bound of remaining matches: {}", upper_bound);
+
+                    let origin_rank = token_rankings
+                        .get(&origin_token_tuple.0)
+                        .map(|(_, rank)| *rank)
+                        .unwrap_or(usize::MAX);
+                    let candidate_rank = token_rankings
+                        .get(&candidate_token_tuple.0)
+                        .map(|(_, rank)| *rank)
+                        .unwrap_or(usize::MAX);
 
-                    if upper_bound > current_threshold {
-                        info!("IF MIN");
-                        let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0];
-                        let candidate_token_tuple =
-                            &vectored_candidate_bow[candidate_last_token_seen_pos.0];
+                    info!(
+                        "Origin: {}, rank: {}, position: {} | Candidate: {}, rank: {}, position: {}",
+                        String::from_utf8_lossy(&origin_token_tuple.0),
+                        origin_rank,
+                        origin_last_token_seen_pos.0,
+                        String::from_utf8_lossy(&candidate_token_tuple.0),
+                        candidate_rank,
+                        candidate_last_token_seen_pos.0
+                    );
+
+                    if current_matches >= current_threshold {
+                        //already reached the threshold, we can stop comparing this candidate and add it to the clone map
+                        info!(
+                            "Threshold reached with current matches {}, adding to clone map.",
+                            current_matches
+                        );
+                        break;
+                    } else if upper_bound + current_matches >= current_threshold {
                         if origin_token_tuple.0 == candidate_token_tuple.0 {
                             //it's a match
                             info!("MATCHING!");
-                            info!(
-                                "MATCH! origin: {}, candidate: {}",
-                                String::from_utf8_lossy(&origin_token_tuple.0),
-                                String::from_utf8_lossy(&candidate_token_tuple.0)
-                            );
                             new_matches += min(origin_token_tuple.1, candidate_token_tuple.1);
                             candidate_last_token_seen_pos.0 += 1;
                             candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
                             origin_last_token_seen_pos.0 += 1;
                             origin_last_token_seen_pos.1 += origin_token_tuple.1;
-                        } else if token_rankings
-                            .get(&origin_token_tuple.0)
-                            .map(|(_, rank)| *rank)
-                            .unwrap_or(usize::MAX)
-                            < token_rankings
-                                .get(&candidate_token_tuple.0)
-                                .map(|(_, rank)| *rank)
-                                .unwrap_or(usize::MAX)
-                        {
-                            //origin token is more frequent than candidate token, so we move in the origin vector
-                            info!(
-                                "origin_count > candidate_count: origin: {}, candidate: {}",
-                                String::from_utf8_lossy(&origin_token_tuple.0),
-                                String::from_utf8_lossy(&candidate_token_tuple.0)
-                            );
-                            origin_last_token_seen_pos.0 += 1;
-                            origin_last_token_seen_pos.1 += origin_token_tuple.1;
-                        } else {
-                            //candidate token is more frequent than origin token, so we move in the candidate vector
-                            info!(
-                                "candidate_count > origin_count: origin: {}, candidate: {}",
-                                String::from_utf8_lossy(&origin_token_tuple.0),
-                                String::from_utf8_lossy(&candidate_token_tuple.0)
-                            );
+                        } else if origin_rank > candidate_rank {
+                            //origin token is more frequent than candidate token, so we move in the candidate vector and mark the skipped candidate token's own occurrences as seen
+                            info!("origin_count > candidate_count");
                             candidate_last_token_seen_pos.0 += 1;
                             candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
+                        } else {
+                            //candidate token is more frequent than origin token, so we move in the origin vector and mark the skipped origin token's own occurrences as seen
+                            info!("candidate_count > origin_count");
+                            origin_last_token_seen_pos.0 += 1;
+                            origin_last_token_seen_pos.1 += origin_token_tuple.1;
                         }
                     } else {
-                        //the upper bound of the remaining matches is not enough to reach the threshold, so we can stop comparing this candidate
-                        /* info!("UPPER BOUND of remaining matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", min(origin_word_count - origin_last_token_seen_pos.1, candidate_word_count - candidate_last_token_seen_pos.1), current_threshold);
-                        info!(
-                            "Current matches: {}, new matches: {}, total possible matches: {}",
-                            candidate_map.get_token_matches(&candidate_id),
-                            new_matches,
-                            candidate_map.get_token_matches(&candidate_id)
-                                + min(
-                                    origin_word_count - origin_last_token_seen_pos.1,
-                                    candidate_word_count - candidate_last_token_seen_pos.1
-                                )
-                        ); */
-                        info!("UPPER BOUND of remaining matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound, current_threshold);
+                        info!("UPPER BOUND + current_matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound + current_matches, current_threshold);
                         info!(
                             "origin_last_token_seen_pos: {}, candidate_last_token_seen_pos: {}",
                             origin_last_token_seen_pos.0, candidate_last_token_seen_pos.0
@@ -658,13 +689,14 @@ fn verify_candidates(
                     candidate_last_token_seen_pos,
                 );
                 if candidate_map.get_token_matches(&candidate_id) >= current_threshold {
-                    //add to clone map
-                    clone_map
-                        .entry(origin_function_id)
-                        .or_default()
-                        .insert(candidate_id);
+                    insert_clone_relation(clone_map, origin_function_id, candidate_id);
+                    info!("*** CLONE DETECTED! ***");
                     info!(
-                        "Clone detected! Candidate: {}, Similarity: {:.2} %",
+                        "Origin: {}, Candidate: {}, Similarity >= {:.2} %",
+                        function_paths_and_lengths
+                            .get(&origin_function_id)
+                            .map(|(path, _)| *path)
+                            .unwrap_or("Unknown"),
                         function_paths_and_lengths
                             .get(&candidate_id)
                             .map(|(path, _)| *path)
@@ -674,6 +706,7 @@ fn verify_candidates(
                             * 100.0
                     );
                 }
+                info!("**********")
             }
             Ok(Err(_)) => {
                 info!("Warning: File too large at path '{}', skipping.", path);
@@ -685,3 +718,23 @@ fn verify_candidates(
     }
     Ok(())
 }
+
+/// Records `candidate_id` as a clone of `origin_function_id` in `clone_map`.
+fn insert_clone_relation(
+    clone_map: &mut CloneMap,
+    origin_function_id: blake3::Hash,
+    candidate_id: blake3::Hash,
+) {
+    // The origin's slot always ends up as Left(set of its clones).
+    let fresh_set = || Either::Left(HashSet::new());
+    match clone_map.entry(origin_function_id).or_insert_with(fresh_set) {
+        Either::Left(known_clones) => {
+            known_clones.insert(candidate_id);
+        }
+        // Any non-Left slot is overwritten with a set holding only this clone.
+        slot => *slot = Either::Left(HashSet::from([candidate_id])),
+    }
+
+    // The clone's own slot points back to its origin as Right(origin hash).
+    clone_map.insert(candidate_id, Either::Right(origin_function_id));
+}
diff --git a/src/utils/bow.rs b/src/utils/bow.rs
index 897b0a8..1dc087d 100644
--- a/src/utils/bow.rs
+++ b/src/utils/bow.rs
@@ -99,9 +99,9 @@ impl Bow {
     pub fn token_rankings(&self) -> HashMap<Vec<u8>, (usize, usize)> {
         let mut rankings: HashMap<Vec<u8>, (usize, usize)> = HashMap::new();
         let mut count_vec: Vec<(&Vec<u8>, &usize)> = self.map.iter().collect();
-        //count_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count in descending order
+        //count_vec.sort_by(|a, b| a.1.cmp(b.1)); // Sort by count in ascending order
         count_vec.sort_by(|a, b| {
-            b.1.cmp(a.1) // primary: count descending
+            a.1.cmp(b.1) // primary: count ascending
                 .then_with(|| a.0.cmp(b.0)) // secondary: token ascending
         });
         for (rank, (token, count)) in count_vec.into_iter().enumerate() {
diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs
index 52bee53..06d8198 100644
--- a/src/utils/candidate_map.rs
+++ b/src/utils/candidate_map.rs
@@ -160,7 +160,7 @@ impl CandidateMap {
         }
     }
 
-    pub fn verification_cost_estimate(&self, n: usize) -> usize {
+    pub fn verification_cost_estimate(&self, n: usize, origin_word_count: &usize) -> usize {
         let mut number_of_candidates = self.count_candidates_with_n_matches(n, "at_least"); //the candidates that have already reached n matches
 
         let mut survivors = 0usize;
@@ -183,6 +183,6 @@ impl CandidateMap {
         } else {
             (length_range.0 + length_range.1) / 2
         };
-        number_of_candidates * average_length
+        number_of_candidates * (*origin_word_count + average_length)
     }
 }

From 424991b475f659e78122e5b8e74b0e3df91697d4 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Mon, 27 Apr 2026 14:40:31 +0200
Subject: [PATCH 12/14] java example functions added

---
 tests/data/keywords/java.json                             | 3 +++
 .../files/java_examples/CB1/factorial.java                | 4 ++++
 .../files/java_examples/CB1/factorial.java.functions/2-1  | 3 +++
 .../files/java_examples/CB2/factorial.java                | 8 ++++++++
 .../files/java_examples/CB2/factorial.java.functions/2-1  | 7 +++++++
 .../files/java_examples/CB3/factorial.java                | 8 ++++++++
 .../files/java_examples/CB3/factorial.java.functions/2-1  | 7 +++++++
 .../files/java_examples/CB4/main.java                     | 8 ++++++++
 .../files/java_examples/CB4/main.java.functions/2-1       | 7 +++++++
 .../files/java_examples/CB5/factorial.java                | 6 ++++++
 .../files/java_examples/CB5/factorial.java.functions/2-1  | 5 +++++
 11 files changed, 66 insertions(+)
 create mode 100644 tests/data/keywords/java.json
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1

diff --git a/tests/data/keywords/java.json b/tests/data/keywords/java.json
new file mode 100644
index 0000000..e0f964a
--- /dev/null
+++ b/tests/data/keywords/java.json
@@ -0,0 +1,3 @@
+{
+    "languages": ["java"]
+}
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java
new file mode 100644
index 0000000..71d2783
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java
@@ -0,0 +1,4 @@
+//Code Block 1 (CB1)
+public static int factorial(int result) {
+  if(result <= 1) return 1;
+  return result * factorial(result-1); }
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1
new file mode 100644
index 0000000..07a6e64
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1
@@ -0,0 +1,3 @@
+public static int factorial(int result) {
+  if(result <= 1) return 1;
+  return result * factorial(result-1); }
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java
new file mode 100644
index 0000000..85cffd4
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java
@@ -0,0 +1,8 @@
+//Code Block 2 (CB2)
+public static int factorial(int n) {
+    int result = 1;
+    for(int i=1; i<=n; i++) {
+        result = result * i;
+    }
+    return result;
+}
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1
new file mode 100644
index 0000000..9c06340
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1
@@ -0,0 +1,7 @@
+public static int factorial(int n) {
+    int result = 1;
+    for(int i=1; i<=n; i++) {
+        result = result * i;
+    }
+    return result;
+}
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java
new file mode 100644
index 0000000..6de0bc4
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java
@@ -0,0 +1,8 @@
+//Code Block 3 (CB3)
+public static int factorial(int n) {
+  if(n >= 0) {
+    result[0] = 1;
+    for(int i=1; i<=n; i++) {
+      result[i] = i * result[i-1];
+    }
+    return result[n]; } }
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1
new file mode 100644
index 0000000..725c461
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1
@@ -0,0 +1,7 @@
+public static int factorial(int n) {
+  if(n >= 0) {
+    result[0] = 1;
+    for(int i=1; i<=n; i++) {
+      result[i] = i * result[i-1];
+    }
+    return result[n]; } }
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java
new file mode 100644
index 0000000..87d59bb
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java
@@ -0,0 +1,8 @@
+//Code Block 4 (CB4)
+public static void main(String[] args) {
+    int result = 5;
+    int factorial = result;
+    for(int i=result-1; i>1; i--) {
+        factorial = factorial * i;
+    }
+}
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1
new file mode 100644
index 0000000..02d6edf
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1
@@ -0,0 +1,7 @@
+public static void main(String[] args) {
+    int result = 5;
+    int factorial = result;
+    for(int i=result-1; i>1; i--) {
+        factorial = factorial * i;
+    }
+}
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java
new file mode 100644
index 0000000..2ed89f7
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java
@@ -0,0 +1,6 @@
+//Code Block 5 (CB5)
+public int factorial(int result) {
+    if(result == 0) {
+        return 1;
+    } else {
+        return result * factorial(result-1); } }
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1
new file mode 100644
index 0000000..cd86a1e
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1
@@ -0,0 +1,5 @@
+public int factorial(int result) {
+    if(result == 0) {
+        return 1;
+    } else {
+        return result * factorial(result-1); } }
\ No newline at end of file

From 8334756271b77b27b96775748d67e81a4643c593 Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Mon, 27 Apr 2026 15:05:28 +0200
Subject: [PATCH 13/14] switched info! to debug! or warn!

---
 src/phases/type_3_duplicate_files.rs | 88 ++++++++++++++--------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs
index 1a44e87..b3337cc 100644
--- a/src/phases/type_3_duplicate_files.rs
+++ b/src/phases/type_3_duplicate_files.rs
@@ -13,7 +13,7 @@ use polars::prelude::*;
 use std::cmp::{max, min};
 use std::collections::{HashMap, HashSet};
 use std::vec;
-use tracing::info;
+use tracing::{debug, info, warn};
 
 type CloneMap = HashMap<blake3::Hash, Either<HashSet<blake3::Hash>, blake3::Hash>>;
 
@@ -208,26 +208,26 @@ pub fn run(
         let word = word.to_owned().as_bytes().to_ascii_lowercase();
         let mut index_number = 1;
         for index in vector_of_indices.iter() {
-            info!(
+            debug!(
                 "Index {} has {} entries, total length of vectors in entries: {}",
                 index_number,
                 index.len(),
                 index.len_tokens()
             );
             if let Some(entries) = index.get(&word) {
-                info!(
+                debug!(
                     "Entries for the example word '{}' in index {}§:",
                     String::from_utf8_lossy(&word),
                     index_number
                 );
                 for (function_id, count, (token_position, cumulative_count)) in entries {
-                    info!(
+                    debug!(
                         "Function ID: {}, Count: {}, Token Position: {}, Cumulative Count: {}",
                         function_id, count, token_position, cumulative_count
                     );
                 }
             } else {
-                info!(
+                debug!(
                     "The example word '{}' was not found in index {}.",
                     String::from_utf8_lossy(&word),
                     index_number
@@ -319,10 +319,10 @@ fn index_builder(
                 }
             }
             Ok(Err(_e)) => {
-                info!("Warning: File too large at path '{}', skipping.", path);
+                warn!("File too large at path '{}', skipping.", path);
             }
             Err(_e) => {
-                info!("Failed to read file at path '{}', skipping.", path);
+                warn!("Failed to read file at path '{}', skipping.", path);
             }
         }
     }
@@ -352,7 +352,7 @@ fn delta_filter_cost(
 
 fn weighted_prefix_end(vectored_bow: &[(Vec<u8>, usize)], prefix_length: usize) -> usize {
     if prefix_length == 0 {
-        info!("Prefix length is 0, returning 0 for weighted prefix end.");
+        debug!("Prefix length is 0, returning 0 for weighted prefix end.");
         // This case shouldn't be seen
         return 0;
     }
@@ -363,7 +363,7 @@ fn weighted_prefix_end(vectored_bow: &[(Vec<u8>, usize)], prefix_length: usize)
             return idx + 1; //Enumerator is 0-based, so we need to add 1 to get the correct length of the prefix vector
         }
     }
-    info!("Warning: prefix_length {} is greater than total token count {}, returning full length of vectored_bow.", prefix_length, vectored_bow.len());
+    warn!("prefix_length {} is greater than total token count {}, returning full length of vectored_bow.", prefix_length, vectored_bow.len());
     vectored_bow.len()
 }
 
@@ -379,7 +379,7 @@ fn detect_clones(
     let word_matcher: Matcher = Matcher::words_matcher();
     let p_prefix = vector_of_indices.len();
     for (path, origin_word_count) in function_paths_and_lengths.values() {
-        info!("-----------------------------------------------------------------------------");
+        debug!("-----------------------------------------------------------------------------");
         // info!("Path: {}, Words: {}", path, origin_word_count);
         match load_file(path, 1024 * 1024 * 1024) {
             Ok(Ok(function_code)) => {
@@ -392,7 +392,7 @@ fn detect_clones(
                         .unwrap_or(usize::MAX)
                 });
                 let origin_function_id = blake3::hash(path.as_bytes());
-                info!("Origin path: {}", path);
+                debug!("Origin path: {}", path);
                 let mut candidate_map = CandidateMap::new();
 
                 let prefix_length = origin_word_count
@@ -432,7 +432,7 @@ fn detect_clones(
                                 continue; //skip candidates that have already been processed as origins
                             } */
                             if clone_map.contains_key(&candidate.0) {
-                                info!("DClone: SKIPPING candidate at path '{}' since it already has an entry in clone_map.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown"));
+                                debug!("DClone: SKIPPING candidate at path '{}' since it already has an entry in clone_map.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown"));
                                 continue;
                             }
                             let candidate_word_count = function_paths_and_lengths
@@ -479,8 +479,8 @@ fn detect_clones(
                     total_cost_vector.push(filter_cost + verification_cost);
 
                     if total_cost_vector[p] > total_cost_vector[p - 1] {
-                        info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]);
-                        info!("The next prefix scheme {} has estimated total cost of {}, filter cost: {}, verification cost: {}.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]);
+                        debug!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]);
+                        debug!("The next prefix scheme {} has estimated total cost of {}, filter cost: {}, verification cost: {}.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]);
                         verify_candidates(
                             origin_function_id,
                             &origin_vectored_bow,
@@ -513,15 +513,15 @@ fn detect_clones(
                         }
                     }
                 }
-                info!("Filter cost vector: {:?}", filter_cost_vector);
-                info!("Verification cost vector: {:?}", verification_cost_vector);
-                info!("Total cost vector: {:?}", total_cost_vector);
+                debug!("Filter cost vector: {:?}", filter_cost_vector);
+                debug!("Verification cost vector: {:?}", verification_cost_vector);
+                debug!("Total cost vector: {:?}", total_cost_vector);
             }
             Ok(Err(_e)) => {
-                info!("Warning: File too large at path '{}', skipping.", path);
+                warn!("File too large at path '{}', skipping.", path);
             }
             Err(_e) => {
-                info!("Failed to read file at path '{}', skipping.", path);
+                warn!("Failed to read file at path '{}', skipping.", path);
             }
         }
     }
@@ -548,7 +548,7 @@ fn verify_candidates(
         .get(&origin_function_id)
         .copied()
         .unwrap_or(("Unknown", 0));
-    info!(
+    debug!(
         "Verifying candidates for function at path '{}', with word count {}.",
         origin_path, origin_word_count
     );
@@ -559,25 +559,25 @@ fn verify_candidates(
         .iter()
         .map(|(token, count)| (String::from_utf8_lossy(token).to_string(), *count))
         .collect();
-    info!("sorted origin vector: {:?}", origin_vector_readable);
-    info!("origin_function_id: {:?}", origin_function_id);
-    info!("origin_id as bytes: {:?}", origin_function_id.as_bytes());
+    debug!("sorted origin vector: {:?}", origin_vector_readable);
+    debug!("origin_function_id: {:?}", origin_function_id);
+    debug!("origin_id as bytes: {:?}", origin_function_id.as_bytes());
     for candidate_id in candidates_to_verify {
-        info!("----------------------");
+        debug!("----------------------");
         let (path, length) = function_paths_and_lengths
             .get(&candidate_id)
             .copied()
             .unwrap();
-        info!("candidate_id as bytes: {:?}", candidate_id.as_bytes());
+        debug!("candidate_id as bytes: {:?}", candidate_id.as_bytes());
         if clone_map.contains_key(&candidate_id) {
-            info!(
+            debug!(
                 "SKIPPING candidate at path '{}' since it already has an entry in clone_map.",
                 path
             );
             continue;
         }
         if candidate_id == origin_function_id {
-            info!("Skipping self-comparison for function at path '{}'.", path);
+            debug!("Skipping self-comparison for function at path '{}'.", path);
             continue; //skip comparing the function to itself
         }
         let mut origin_last_token_seen_pos = prefix_origin_last_token_seen_pos;
@@ -585,7 +585,7 @@ fn verify_candidates(
             Ok(Ok(candidate_code)) => {
                 // Handle successful file load
                 // load function, sort tokens by global frequency, calculate similarity, if above threshold add to clone map
-                info!("Candidate loaded: {}, length: {}", path, length);
+                debug!("Candidate loaded: {}, length: {}", path, length);
                 let candidate_bow = word_matcher.bag_of_words(&candidate_code.to_ascii_lowercase());
                 let mut vectored_candidate_bow = candidate_bow.vectorize();
                 vectored_candidate_bow.sort_by_key(|(token, _)| {
@@ -598,7 +598,7 @@ fn verify_candidates(
                     .iter()
                     .map(|(token, count)| (String::from_utf8_lossy(token).to_string(), *count))
                     .collect();
-                info!("sorted candidate vector: {:?}", candidate_vector_readable);
+                debug!("sorted candidate vector: {:?}", candidate_vector_readable);
                 let candidate_word_count = length;
                 let candidate_token_count = vectored_candidate_bow.len();
                 let current_threshold = (max(origin_word_count, candidate_word_count) as f64
@@ -620,12 +620,12 @@ fn verify_candidates(
                     let candidate_token_tuple =
                         &vectored_candidate_bow[candidate_last_token_seen_pos.0];
 
-                    info!("Current threshold: {}", current_threshold);
-                    info!(
+                    debug!("Current threshold: {}", current_threshold);
+                    debug!(
                         "Current matches: {} + {} = {}",
                         prefix_matches, new_matches, current_matches
                     );
-                    info!("Upper bound of remaining matches: {}", upper_bound);
+                    debug!("Upper bound of remaining matches: {}", upper_bound);
 
                     let origin_rank = token_rankings
                         .get(&origin_token_tuple.0)
@@ -636,7 +636,7 @@ fn verify_candidates(
                         .map(|(_, rank)| *rank)
                         .unwrap_or(usize::MAX);
 
-                    info!(
+                    debug!(
                         "Origin: {}, rank: {}, position: {} | Candidate: {}, rank: {}, position: {}",
                         String::from_utf8_lossy(&origin_token_tuple.0),
                         origin_rank,
@@ -648,7 +648,7 @@ fn verify_candidates(
 
                     if current_matches >= current_threshold {
                         //already reached the threshold, we can stop comparing this candidate and add it to the clone map
-                        info!(
+                        debug!(
                             "Threshold reached with current matches {}, adding to clone map.",
                             current_matches
                         );
@@ -656,7 +656,7 @@ fn verify_candidates(
                     } else if upper_bound + current_matches >= current_threshold {
                         if origin_token_tuple.0 == candidate_token_tuple.0 {
                             //it's a match
-                            info!("MATCHING!");
+                            debug!("MATCHING!");
                             new_matches += min(origin_token_tuple.1, candidate_token_tuple.1);
                             candidate_last_token_seen_pos.0 += 1;
                             candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
@@ -664,18 +664,18 @@ fn verify_candidates(
                             origin_last_token_seen_pos.1 += origin_token_tuple.1;
                         } else if origin_rank > candidate_rank {
                             //origin token is more frequent than candidate token, so we move in the candidate vector
-                            info!("origin_count > candidate_count");
+                            debug!("origin_count > candidate_count");
                             candidate_last_token_seen_pos.0 += 1;
                             candidate_last_token_seen_pos.1 += origin_token_tuple.1;
                         } else {
                             //candidate token is more frequent than origin token, so we move in the origin vector
-                            info!("candidate_count > origin_count");
+                            debug!("candidate_count > origin_count");
                             origin_last_token_seen_pos.0 += 1;
                             origin_last_token_seen_pos.1 += candidate_token_tuple.1;
                         }
                     } else {
-                        info!("UPPER BOUND + current_matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound + current_matches, current_threshold);
-                        info!(
+                        debug!("UPPER BOUND + current_matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound + current_matches, current_threshold);
+                        debug!(
                             "origin_last_token_seen_pos: {}, candidate_last_token_seen_pos: {}",
                             origin_last_token_seen_pos.0, candidate_last_token_seen_pos.0
                         );
@@ -690,8 +690,8 @@ fn verify_candidates(
                 );
                 if candidate_map.get_token_matches(&candidate_id) >= current_threshold {
                     insert_clone_relation(clone_map, origin_function_id, candidate_id);
-                    info!("*** CLONE DETECTED! ***");
-                    info!(
+                    debug!("*** CLONE DETECTED! ***");
+                    debug!(
                         "Origin: {}, Candidate: {}, Similarity >= {:.2} %",
                         function_paths_and_lengths
                             .get(&origin_function_id)
@@ -706,13 +706,13 @@ fn verify_candidates(
                             * 100.0
                     );
                 }
-                info!("**********")
+                debug!("**********")
             }
             Ok(Err(_)) => {
-                info!("Warning: File too large at path '{}', skipping.", path);
+                warn!("File too large at path '{}', skipping.", path);
             }
             Err(_) => {
-                info!("Failed to read file at path '{}', skipping.", path);
+                warn!("Failed to read file at path '{}', skipping.", path);
             }
         }
     }

From b0777c9e6da1cf12710cbb49e1c97f65eb4bc6ab Mon Sep 17 00:00:00 2001
From: swartling <holger.swartling@gmail.com>
Date: Wed, 6 May 2026 14:22:35 +0200
Subject: [PATCH 14/14] type_3_duplicate_files: write CSV outputs, add --force/--header, progress bars and timing

---
 src/bin/main.rs                               |   2 +
 src/phases/parse.rs                           |   2 -
 src/phases/tokenizer.rs                       |  27 ++-
 src/phases/type_3_duplicate_files.rs          | 190 +++++++++++++++---
 .../files/java_examples/CB6/factorial.java    |   4 +
 .../identical_functions.csv                   |   3 +
 6 files changed, 191 insertions(+), 37 deletions(-)
 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java
 create mode 100644 tests/data/phases/type_3_duplicate_files/identical_functions.csv

diff --git a/src/bin/main.rs b/src/bin/main.rs
index 715e788..bad656f 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -272,6 +272,8 @@ fn main() {
                                     *cli_subargs.get_one::<usize>("p_prefix").unwrap(),
                                     *cli_subargs.get_one::<f64>("threshold").unwrap(),
                                     cli_subargs.get_one::<String>("example_word"),
+                                    cli_subargs.get_flag("force"),
+                                    cli_subargs.get_one::<String>("header").unwrap(),
                                     &logger,
                                 )
                             }
diff --git a/src/phases/parse.rs b/src/phases/parse.rs
index cabaf25..8ed9661 100644
--- a/src/phases/parse.rs
+++ b/src/phases/parse.rs
@@ -455,7 +455,6 @@ fn analyze_file(
     ignore_comments: bool,
     word_counter: &Matcher,
 ) -> Result<(String, Option<String>)> {
-    info!("analyze_file called with path: {path}");
     let grammar = language_to_grammar(language)
         .with_context(|| format!("Unsupported language: {language}"))?;
     // Initializes the parser
@@ -463,7 +462,6 @@ fn analyze_file(
     parser.set_language(&grammar.lang)?;
     match load_file(path, 1024 * 1024 * 1024)? {
         Ok(source_code) => {
-            info!("File {path} loaded successfully");
             // Creates a folder to store the functions of the file
             let target_folder: String = format!("{path}.functions");
             create_dir(&target_folder)?;
diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs
index 684adda..c7b46bf 100644
--- a/src/phases/tokenizer.rs
+++ b/src/phases/tokenizer.rs
@@ -149,15 +149,24 @@ pub fn run(input_path: &str, example_word: &str, _logger: &Logger) -> Result<()>
 } */
 
 pub fn global_counter(input_file: &DataFrame) -> Result<Bow> {
+    use indicatif::ProgressBar;
+    use std::time::Instant;
+    info!("Building global Bag of Words from the functions in the input file...");
+    let bow_start = Instant::now();
+    let paths_column = input_file.column("path").and_then(|c| c.str())?;
+    let total_files = paths_column.len();
+
+    let bow_progress = ProgressBar::new(total_files as u64);
+    bow_progress.set_style(
+        indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?,
+    );
+    bow_progress.set_message("Building global bag-of-words...");
+
     let word_matcher: Matcher = Matcher::words_matcher();
     let mut global_bow: Bow = Bow::new();
 
-    for row in input_file
-        .column("path")
-        .and_then(|c| c.str())
-        .unwrap()
-        .into_iter()
-    {
+    for row in paths_column.into_iter() {
+        bow_progress.inc(1);
         match row {
             Some(path) => {
                 //let function_code = std::fs::read_to_string(path)?;
@@ -168,7 +177,7 @@ pub fn global_counter(input_file: &DataFrame) -> Result<Bow> {
                         global_bow.merge(local_bow);
                     }
                     Ok(Err(_e)) => {
-                        info!("  Warning: File to large at path {}", path);
+                        info!("  Warning: File too large at path {}", path);
                     }
                     Err(_e) => {
                         info!("  Warning: Could not load file at path {}", path);
@@ -181,6 +190,10 @@ pub fn global_counter(input_file: &DataFrame) -> Result<Bow> {
         }
     }
 
+    bow_progress.finish_and_clear();
+    let bow_duration = bow_start.elapsed();
+    info!("BOW building took: {:.2}s", bow_duration.as_secs_f64());
+
     Ok(global_bow)
 }
 
diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs
index b3337cc..59d1bf8 100644
--- a/src/phases/type_3_duplicate_files.rs
+++ b/src/phases/type_3_duplicate_files.rs
@@ -2,16 +2,19 @@ use crate::phases::tokenizer::global_counter;
 use crate::utils::candidate_map::*;
 use crate::utils::fs::*;
 use crate::utils::inverted_index::*;
-use crate::utils::logger::Logger;
+use crate::utils::logger::{log_output_file, log_write_output, Logger};
 use crate::utils::regex::*;
 use anyhow::{/* Error,  */ Result};
 use blake3;
+use clap::ArgAction;
 use clap::{Arg, Command};
 use core::f64;
 use either::Either;
+use indicatif::ProgressBar;
 use polars::prelude::*;
 use std::cmp::{max, min};
 use std::collections::{HashMap, HashSet};
+use std::time::Instant;
 use std::vec;
 use tracing::{debug, info, warn};
 
@@ -60,7 +63,7 @@ pub fn cli() -> Command {
         .arg(
             Arg::new("threads")
                 .short('n')
-                .help("Number of threads to use, default is 1.")
+                .help("Number of threads to use, default is 1.  CURRENT VERSION IS SINGLE THREADED")
                 .default_value("1")
                 .value_parser(clap::value_parser!(usize))
         )
@@ -87,6 +90,20 @@ pub fn cli() -> Command {
                 .help("An example word to check the global Bag of Words for.")
                 .required(false),
         )
+        .arg(
+            Arg::new("force")
+                .short('f')
+                .long("force")
+                .help("Override the output CSV file if it already exists.")
+                .default_value("false")
+                .action(ArgAction::SetTrue)
+        )
+        .arg(
+            Arg::new("header")
+                .long("header")
+                .help("Name of column storing file paths in the input CSV file.")
+                .default_value("path"),
+        )
 }
 
 pub fn run(
@@ -100,11 +117,14 @@ pub fn run(
     p_prefix: usize,               //number of tokens to consider for the prefix, default is 1
     threshold: f64,                //threshold for the prefix length, default is 0.8
     example_word: Option<&String>, //an example word to check the global Bag of Words for, optional
+    force: bool,                   //whether to overwrite the output CSV file if it already exists
+    input_header: &str,            //name of column storing file paths in the input CSV file
     _logger: &Logger,
 ) -> Result<()> {
     //let language = "java";
     let language = opt_language.unwrap_or("java"); //default to java currently
-    let minimum_loc = 2; //temporary
+    let minimum_loc = 0; //temporary
+    let total_start = Instant::now();
     let mut input_file = open_csv(
         input_path,
         Some(Schema::from_iter(vec![
@@ -156,18 +176,8 @@ pub fn run(
 
     let n_functions_after_loc = input_file.height();
 
-    info!(
-        "{} functions found after filtering  ({:.2} %)", //something is weird with the percentage calculation here.
-        n_functions_after_loc,
-        if n_functions_before_loc == 0 {
-            0
-        } else {
-            (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize
-        }
-    );
-
     //moved here from detect_clones
-    let paths_column = input_file.column("path")?.str()?;
+    let paths_column = input_file.column(input_header)?.str()?;
     let words_column = input_file.column("words")?.u32()?;
     let rows: Vec<(&str, usize)> = paths_column
         .into_iter()
@@ -185,6 +195,7 @@ pub fn run(
 
     let global_bow = global_counter(&input_file)?;
     let token_rankings = global_bow.token_rankings();
+    let index_start = Instant::now();
     let vector_of_indices_plus_min_max = index_builder(
         &input_file,
         &token_rankings,
@@ -192,6 +203,8 @@ pub fn run(
         threshold,
         &function_paths_and_lengths,
     )?;
+    let index_duration = index_start.elapsed();
+    info!("Index building took: {:.2}s", index_duration.as_secs_f64());
     // Maximum and minimum 'words' in input file
     let vector_of_indices = vector_of_indices_plus_min_max.0;
     let min_words = vector_of_indices_plus_min_max.1 .0;
@@ -238,16 +251,110 @@ pub fn run(
     }
 
     //go through input file again? Means i can grab 'words' from the file. Could do something like just checking candidates
+    let verify_start = Instant::now();
     let clone_map = detect_clones(
         &token_rankings,
         &vector_of_indices,
         threshold,
         &function_paths_and_lengths,
     )?;
+    let verify_duration = verify_start.elapsed();
+    info!("Verification took: {:.2}s", verify_duration.as_secs_f64());
+
+    // Ok() is kept at the end of the function
+    // Prepare CSV outputs: map of clone -> origin, and unique files list
+    let default_output_path: String = format!("{input_path}.unique.csv");
+    let default_map_path: String = format!("{input_path}.duplicates_map.csv");
+    let output_path: &str = _output_path.unwrap_or(&default_output_path);
+    let map_path: &str = _map_path.unwrap_or(&default_map_path);
+
+    // Build clone -> origin mapping
+    let mut clone_to_origin: HashMap<blake3::Hash, blake3::Hash> = HashMap::new();
+    for (k, v) in clone_map.iter() {
+        match v {
+            Either::Left(set) => {
+                for c in set.iter() {
+                    clone_to_origin.insert(*c, *k);
+                }
+            }
+            Either::Right(origin) => {
+                clone_to_origin.insert(*k, *origin);
+            }
+        }
+    }
+
+    // Map CSV rows
+    let mut map_names: Vec<String> = Vec::new();
+    let mut map_originals: Vec<String> = Vec::new();
+    for (clone_hash, origin_hash) in clone_to_origin.iter() {
+        let clone_path = function_paths_and_lengths
+            .get(clone_hash)
+            .map(|(p, _)| *p)
+            .unwrap_or("Unknown")
+            .to_string();
+        let origin_path = function_paths_and_lengths
+            .get(origin_hash)
+            .map(|(p, _)| *p)
+            .unwrap_or("Unknown")
+            .to_string();
+        map_names.push(clone_path);
+        map_originals.push(origin_path);
+    }
+
+    let mut map_df = DataFrame::new(vec![
+        polars::prelude::Column::new("name".into(), map_names),
+        polars::prelude::Column::new("original".into(), map_originals),
+    ])?;
+
+    // Unique files: those that are not listed as clones
+    let mut unique_paths: Vec<String> = Vec::new();
+    let mut unique_words: Vec<u32> = Vec::new();
+    for (hash, (path, words)) in function_paths_and_lengths.iter() {
+        if !clone_to_origin.contains_key(hash) {
+            unique_paths.push(path.to_string());
+            unique_words.push(*words as u32);
+        }
+    }
+
+    let mut output_df = DataFrame::new(vec![
+        polars::prelude::Column::new("path".into(), unique_paths),
+        polars::prelude::Column::new("words".into(), unique_words),
+    ])?;
+
+    // Check output files and write
+    log_output_file(output_path, false, force)?;
+    log_write_output(_logger, map_path, &mut map_df, false)?;
+    log_write_output(_logger, output_path, &mut output_df, false)?;
+
     info!(
-        "Finished detecting clones. {} clones found.",
-        clone_map.len()
+        "Remaining files: {} / {:.2} %", // NOTE: the percentage below is an integer (`0` / `as usize`), so the `.2` precision is ignored and no decimals are printed.
+        n_functions_after_loc,
+        if n_functions_before_loc == 0 {
+            0
+        } else {
+            (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize
+        }
     );
+
+    let unique_files = output_df.height();
+    let unique_file_percentage = (unique_files as f64 / n_functions_after_loc as f64) * 100.0;
+
+    info!(
+        "Unique files: {} / {:.2} %",
+        unique_files, unique_file_percentage
+    );
+
+    let duplicate_files = n_functions_after_loc - unique_files;
+    let duplicate_file_percentage = (duplicate_files as f64 / n_functions_after_loc as f64) * 100.0;
+
+    info!(
+        "Duplicate files: {} / {:.2} %",
+        duplicate_files, duplicate_file_percentage
+    );
+
+    let total_duration = total_start.elapsed();
+    info!("Total runtime: {:.2}s", total_duration.as_secs_f64());
+
     Ok(())
 }
 
@@ -258,6 +365,11 @@ fn index_builder(
     threshold: f64,
     function_paths_and_lengths: &HashMap<blake3::Hash, (&str, usize)>,
 ) -> Result<(Vec<InvertedIndex>, (usize, usize))> {
+    info!("Building indices...");
+    let index_progress = ProgressBar::new(function_paths_and_lengths.len() as u64);
+    index_progress.set_style(
+        indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?,
+    );
     let word_matcher: Matcher = Matcher::words_matcher();
 
     let mut vector_of_indices: Vec<InvertedIndex> = Vec::new();
@@ -274,6 +386,7 @@ fn index_builder(
         .into_iter()
         .flatten()
     {
+        index_progress.inc(1);
         match load_file(path, 1024 * 1024 * 1024) {
             Ok(Ok(function_code)) => {
                 let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase());
@@ -326,6 +439,7 @@ fn index_builder(
             }
         }
     }
+    index_progress.finish();
     info!("Finished building indices.");
     Ok((vector_of_indices, (min_words, max_words)))
 }
@@ -378,8 +492,15 @@ fn detect_clones(
 
     let word_matcher: Matcher = Matcher::words_matcher();
     let p_prefix = vector_of_indices.len();
+    info!("Detecting clones");
+    let detection_progress = ProgressBar::new(function_paths_and_lengths.len() as u64);
+    detection_progress.set_style(
+        indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?,
+    );
+
     for (path, origin_word_count) in function_paths_and_lengths.values() {
         debug!("-----------------------------------------------------------------------------");
+        detection_progress.inc(1);
         // info!("Path: {}, Words: {}", path, origin_word_count);
         match load_file(path, 1024 * 1024 * 1024) {
             Ok(Ok(function_code)) => {
@@ -398,7 +519,10 @@ fn detect_clones(
                 let prefix_length = origin_word_count
                     - ((*origin_word_count as f64) * threshold).round() as usize
                     + 1;
-
+                debug!(
+                    "Prefix length: {}, origin word count: {}, threshold: {}",
+                    prefix_length, origin_word_count, threshold
+                );
                 let init_prefix_end = weighted_prefix_end(&origin_vectored_bow, prefix_length);
                 let mut filter_cost_vector: Vec<usize> = Vec::new();
                 filter_cost_vector.push(0); //cost of prefix scheme 1 is calculated from an empty prefix, so the initial cost is 0
@@ -419,14 +543,17 @@ fn detect_clones(
                         && origin_token_position < origin_vectored_bow.len()
                     {
                         let token_tuple = origin_vectored_bow.get(origin_token_position).unwrap();
+                        //debug!("Processing token {} at position {} in the origin prefix vector for prefix scheme {}.", String::from_utf8_lossy(&token_tuple.0), origin_token_position, p);
                         //loop through the prefix vector of the current scheme, for the first scheme this is just the original prefix vector, for the next schemes this includes additional tokens
                         let is_new = origin_token_position + 1 == prefix_end;
                         origin_cumulative_count += token_tuple.1;
                         filter_cost += delta_filter_cost(token_tuple, vector_of_indices, p, is_new);
+                        debug!("Origin    ID {}, count {}, token {}, index {}, tok_pos {}, wordpos {}.", origin_function_id, token_tuple.1, String::from_utf8_lossy(&token_tuple.0), p, origin_token_position, origin_cumulative_count);
                         for candidate in vector_of_indices[p - 1]
                             .get(&token_tuple.0)
                             .unwrap_or(&Vec::new())
                         {
+                            debug!("Candidate ID {}, count {}, token {}, index {}, tok_pos {}, wordpos {}.", candidate.0, candidate.1, String::from_utf8_lossy(&token_tuple.0), p, candidate.2.0, candidate.2.1);
                             /* if candidate_id_lt_origin_id(&candidate.0, &origin_function_id) {
                                 info!("DClone: SKIPPING candidate at path '{}' since it has a lower function ID than the origin.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown"));
                                 continue; //skip candidates that have already been processed as origins
@@ -443,6 +570,7 @@ fn detect_clones(
                             if candidate_word_count
                                 < ((*origin_word_count as f64) * threshold).round() as usize
                             {
+                                debug!("DClone: SKIPPING candidate at path '{}' since its word count {} is below the threshold for clones with the origin ({} words, threshold {}).", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown"), candidate_word_count, origin_word_count, threshold);
                                 continue; //skip candidates that are too small to reach the threshold
                             }
 
@@ -454,9 +582,11 @@ fn detect_clones(
                                     .round() as usize;
                             let upper_bound = min(
                                 *origin_word_count - origin_cumulative_count,
-                                candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates
+                                candidate_word_count - last_token_seen_pos.1, //candidate.2.1 is the number of words seen up to and including this token including duplicates
                             );
-                            if candidate_map.get_token_matches(&function_id) + upper_bound
+                            if candidate_map.get_token_matches(&function_id)
+                                + upper_bound
+                                + new_matches
                                 >= current_threshold
                             {
                                 candidate_map.add_pending_update(
@@ -487,7 +617,7 @@ fn detect_clones(
                             (origin_token_position, origin_cumulative_count),
                             &mut candidate_map,
                             &mut clone_map,
-                            p_prefix,
+                            p - 1,
                             token_rankings,
                             threshold,
                             function_paths_and_lengths,
@@ -497,6 +627,7 @@ fn detect_clones(
                         //apply updates
                         candidate_map.apply_pending_updates(function_paths_and_lengths);
                         if p == p_prefix {
+                            debug!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. This is the last prefix scheme, moving on to verification phase.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]);
                             //return verify_candidates(candidate_map, path, function_code, p);
                             verify_candidates(
                                 origin_function_id,
@@ -525,7 +656,7 @@ fn detect_clones(
             }
         }
     }
-
+    detection_progress.finish();
     Ok(clone_map)
 }
 
@@ -609,7 +740,7 @@ fn verify_candidates(
                 let mut new_matches = 0usize;
                 let prefix_matches = candidate_map.get_token_matches(&candidate_id);
                 while origin_last_token_seen_pos.0 < origin_token_count
-                    && candidate_last_token_seen_pos.0 < candidate_token_count
+                    && candidate_last_token_seen_pos.0 + 1 < candidate_token_count
                 {
                     let upper_bound = min(
                         origin_word_count - origin_last_token_seen_pos.1,
@@ -618,7 +749,12 @@ fn verify_candidates(
                     let current_matches = prefix_matches + new_matches;
                     let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0];
                     let candidate_token_tuple =
-                        &vectored_candidate_bow[candidate_last_token_seen_pos.0];
+                        &vectored_candidate_bow[candidate_last_token_seen_pos.0 + 1];
+
+                    let candidate_current_token_pos = (
+                        candidate_last_token_seen_pos.0 + 1,
+                        candidate_last_token_seen_pos.1 + candidate_token_tuple.1,
+                    );
 
                     debug!("Current threshold: {}", current_threshold);
                     debug!(
@@ -643,7 +779,7 @@ fn verify_candidates(
                         origin_last_token_seen_pos.0,
                         String::from_utf8_lossy(&candidate_token_tuple.0),
                         candidate_rank,
-                        candidate_last_token_seen_pos.0
+                        candidate_current_token_pos.0
                     );
 
                     if current_matches >= current_threshold {
@@ -658,15 +794,13 @@ fn verify_candidates(
                             //it's a match
                             debug!("MATCHING!");
                             new_matches += min(origin_token_tuple.1, candidate_token_tuple.1);
-                            candidate_last_token_seen_pos.0 += 1;
-                            candidate_last_token_seen_pos.1 += candidate_token_tuple.1;
+                            candidate_last_token_seen_pos = candidate_current_token_pos;
                             origin_last_token_seen_pos.0 += 1;
                             origin_last_token_seen_pos.1 += origin_token_tuple.1;
                         } else if origin_rank > candidate_rank {
                             //origin token is more frequent than candidate token, so we move in the candidate vector
                             debug!("origin_count > candidate_count");
-                            candidate_last_token_seen_pos.0 += 1;
-                            candidate_last_token_seen_pos.1 += origin_token_tuple.1;
+                            candidate_last_token_seen_pos = candidate_current_token_pos;
                         } else {
                             //candidate token is more frequent than origin token, so we move in the origin vector
                             debug!("candidate_count > origin_count");
diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java
new file mode 100644
index 0000000..71d2783
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java
@@ -0,0 +1,4 @@
+//Code Block 1 (CB1)
+public static int factorial(int result) {
+  if(result <= 1) return 1;
+  return result * factorial(result-1); }
\ No newline at end of file
diff --git a/tests/data/phases/type_3_duplicate_files/identical_functions.csv b/tests/data/phases/type_3_duplicate_files/identical_functions.csv
new file mode 100644
index 0000000..3e9eee0
--- /dev/null
+++ b/tests/data/phases/type_3_duplicate_files/identical_functions.csv
@@ -0,0 +1,3 @@
+id,path,name,position,language,loc,words,tests/data/phases/type_3_duplicate_files/java.json,loop_statements,loop_nestings,if_statements,if_nestings,functions_calls,function_calls_nestings,params,param_kw_match,parse_error
+0,tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1,factorial,2:1,java,3,16,16,0,0,1,1,1,1,1,1,none
+1,tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java.functions/2-1,factorial,2:1,java,3,16,16,0,0,1,1,1,1,1,1,none
\ No newline at end of file