From 8cdd0a9821a46c463728a6565ad1c2688fdb209f Mon Sep 17 00:00:00 2001 From: swartling Date: Tue, 3 Mar 2026 14:05:45 +0100 Subject: [PATCH 01/14] alternate parser --- .gitignore | 8 + src/bin/main.rs | 20 +- src/phases/alt_parse.rs | 1330 +++++++++++++++++++++++++++++++++++++++ src/phases/mod.rs | 2 + 4 files changed, 1359 insertions(+), 1 deletion(-) create mode 100644 src/phases/alt_parse.rs diff --git a/.gitignore b/.gitignore index 7959c06..2733b64 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,14 @@ data/ ### Scyros ### ghtokens.csv !tests/data +outputs/ *.zip +# Negation pattern in .gitignore +# The '!' prefix in .gitignore negates a previous pattern. +# It tells Git NOT to ignore files matching that pattern, even if they +# would have been ignored by an earlier rule. +# Example: the 'data/' rule above ignores every directory named 'data'; +# '!tests/data' re-includes the 'tests/data' directory, ensuring it is tracked +# by Git despite matching that earlier ignore rule. 
*.tar.gz \ No newline at end of file diff --git a/src/bin/main.rs b/src/bin/main.rs index d736477..ea61c47 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -14,7 +14,7 @@ use clap::{Arg, ArgAction, Command}; use scyros::phases::{ - download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages, + alt_parse, download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages, filter_metadata, forks, ids, languages, metadata, parse, pull_request, }; use scyros::utils::error::*; @@ -36,6 +36,7 @@ fn cli() -> Command { .subcommand(duplicate_files::cli()) .subcommand(parse::cli()) .subcommand(extract_benchmarks::cli()) + .subcommand(alt_parse::cli()) .arg( Arg::new("debug") .long("debug") @@ -213,6 +214,23 @@ fn main() { &mut logger, ) } + else if subcommand == alt_parse::cli().get_name() { + alt_parse::run( + cli_subargs.get_one::("input").unwrap(), + cli_subargs.get_one::("output").map(|x| x.as_str()), + cli_subargs.get_one::("logs").map(|x| x.as_str()), + cli_subargs + .get_many::("lang") + .map(|v| + v.map(|s| s.as_str()) + .collect::>()), + cli_subargs.get_one::("failures").unwrap(), + *cli_subargs.get_one::("threads").unwrap(), + *cli_subargs.get_one::("seed").unwrap(), + cli_subargs.get_flag("force"), + &mut logger, + ) + } else { Error::new(&format!("The subcommand {} is not available. Run the program with the --help flag to see the list of subcommands", subcommand)).to_res() } diff --git a/src/phases/alt_parse.rs b/src/phases/alt_parse.rs new file mode 100644 index 0000000..ed5f6dc --- /dev/null +++ b/src/phases/alt_parse.rs @@ -0,0 +1,1330 @@ +// Copyright 2025 Andrea Gilot +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Parse all the files in the input file and extract the functions whose body contains one of the provided keywords. +//! All parsed files repositories are logged in a CSV file where statistics about the functions are stored. +//! These statistics include the number of lines of code, the number of words, the number of keywords matched, the number of conditional statements, loops, +//! and the maximum nesting level of these statements. +//! The name of the log file is the same as the input file with the extension `.functions`. +//! The functions are stored in a folder with the same name as the file and the extension `_functions`. +//! The supported languages are C, C++, Java, Python and Fortran. + +use clap::ArgAction; +use clap::{Arg, Command}; +use indicatif::ProgressBar; +use polars::prelude::*; +use rand::rngs::StdRng; +use rand::seq::SliceRandom as _; +use rand::SeedableRng; + +use std::iter::FromIterator as _; +use std::vec; +use std::{collections::HashSet, fmt::Write, io::Write as IOWrite, sync::Mutex}; +use tree_sitter::{Language, Node, Parser, Tree}; + +use crate::utils::error::*; +use crate::utils::fs::*; +use crate::utils::regex::*; +use crate::utils::{csv::*, logger::Logger}; + +/// Command line arguments parsing. +pub fn cli() -> Command { + Command::new("alt_parse") + .about("Parse all the files in the dataset and extract functions") + .long_about( + "Parse all the files in the input file and extract functions \ + All parsed files repositories are logged in a CSV file where statistics about the functions are stored. 
\ + These statistics include the number of lines of code, the number of words, the number of keywords matched, the number of conditional statements, loops, + and the maximum nesting level of these statements.\n\ + The name of the log file is the same as the input file with the extension \".functions\". \ + The functions are stored in a folder with the same name as the file and the extension \"_functions\".\n\ + The supported languages are C, C++, Java, Python and Fortran." + ) + .disable_version_flag(true) + .arg( + Arg::new("input") + .short('i') + .long("input") + .value_name("INPUT_FILE.csv") + .help("Path to the input csv file to use. It must be a valid CSV file where the first column is the path to the file and the \ + second column is the extension of the file. Other columns are ignored.") + .required(true) + ) + .arg( + Arg::new("output") + .short('o') + .long("output") + .value_name("OUTPUT_FILE.csv") + .help("Path to the output csv file storing the functions statistics.") + .required(false), + ) + .arg( + Arg::new("logs") + .short('l') + .long("logs") + .value_name("LOGS_FOLDER") + .help("Path to the folder where the logs are stored. The default is the current folder.") + .required(false), + ) +/* .arg( + Arg::new("keywords") + .short('k') + .long("keywords") + .num_args(1..) + .action(ArgAction::Append) + .value_name("KEYWORDS_FILES.json") + .help("List of files containing the list of extensions and keywords to use. The files must be in JSON format.\n\ + The extensions should be written without the period (`java` instead of `.java`). The files must have the following structure:\n \ + {\n \ + \"extensions\": {\n \ + \"ext1\": [\"kw11\", \"kw12\", ...],\n \ + \"ext2\": [\"kw21\", \"kw22\", ...],\n \ + ...\n \ + },\n \ + \"keywords\": [\"kw1\", \"kw2\", ...]\n \ + }") + .required(true) + ) */ + .arg( + Arg::new("lang") + .long("lang") + .num_args(1..) + .action(ArgAction::Append) + .value_name("LANGUAGES") + .help("List of languages to parse. 
The supported languages are C, C++, C#, Fortran, Go, Java, Python and Typescript.") + .required(false) + ) + .arg( + Arg::new("force") + .short('f') + .long("force") + .help("Override the output file if it already exists.") + .default_value("false") + .action(ArgAction::SetTrue), + ) + .arg( + Arg::new("threads") + .short('n') + .help("Number of threads to use.") + .default_value("1") + .value_parser(clap::value_parser!(usize)) + ) + .arg( + Arg::new("seed") + .short('s') + .long("seed") + .value_name("SEED") + .help("Seed used to randomly shuffle the input file.") + .default_value("8155495201244430235") + .value_parser(clap::value_parser!(u64)), + ) + .arg( + Arg::new("failures") + .long("failures") + .value_name("POLICY") + .help("Failure policy when a file or a function has a parsing error.\n\ + ignore: continue parsing\n\ + skip-file: replace the file statistics with an error row in the output file, does not extract any function from the file\n\ + skip-function: replace the function statistics with an error row in the output file\n\ + abort: stop the program") + .default_value("ignore") + .value_parser(["ignore", "skip-file", "skip-function", "abort"]), + ) +} + +/// Simplified version that extracts all functions without caring about keywords or complex metrics. +/// This version only extracts function definitions and basic statistics. +/// +/// # Arguments +/// +/// * `input_file` - Path to the input csv file to use. +/// * `output_file` - Path to the output csv file storing the functions statistics. +/// * `logs_file` - Path to the output csv file storing the files statistics. +/// * `threads` - Number of threads to use. 
+/// +/// All functions are extracted and logged in a CSV file with basic statistics only: +/// id, path, name, position, language, loc, words, params, parse_error +/// +pub fn run( + input_path: &str, + output_path: Option<&str>, + logs_path: Option<&str>, + opt_languages: Option>, + fail_policy: &str, + threads: usize, + seed: u64, + force: bool, + logger: &mut Logger, +) -> Result<(), Error> { + let supported_languages: HashSet<&'static str> = vec![ + "c", + "c++", + "c#", + "java", + "python", + "fortran", + "typescript", + "go", + "scala", + ] + .into_iter() + .collect::>(); + + let languages: Vec<&str> = match opt_languages { + Some(l) => { + for lang in l.iter() { + if !supported_languages.contains(lang) { + Error::new(&format!("Unsupported language: {}", lang)).to_res()?; + } + } + l + } + None => { + logger.log("No language specified, using all supported languages")?; + supported_languages.into_iter().collect() + } + }; + + let languages_series = Series::new( + "language_filter".into(), + languages + .iter() + .map(|x| x.to_string()) + .collect::>(), + ); + + let default_output_path: String = format!("{}.functions_simple.csv", input_path); + let output_path: &str = output_path.unwrap_or(&default_output_path); + + match check_path(output_path) { + Ok(_) => { + if force { + logger.log(&format!("Overriding existing file: {}", output_path))?; + } else { + Error::new(&format!( + "File {} already exists. Use --force to override it.", + output_path + )) + .to_res()?; + } + } + Err(_) => logger.log(&format!("Creating new file: {}", output_path))?, + } + + let default_logs_path: String = format!("{}.function_logs_simple.csv", input_path); + let logs_path: &str = logs_path.unwrap_or(&default_logs_path); + + match check_path(logs_path) { + Ok(_) => { + if force { + logger.log(&format!("Overriding existing file: {}", logs_path))?; + } else { + Error::new(&format!( + "File {} already exists. 
Use --force to override it.", + logs_path + )) + .to_res()?; + } + } + Err(_) => logger.log(&format!("Creating new file: {}", logs_path))?, + } + + let mut input_file = open_csv( + input_path, + Some(Schema::from_iter(vec![ + Field::new("id".into(), DataType::UInt32), + Field::new("name".into(), DataType::String), + Field::new("language".into(), DataType::String), + ])), + Some(vec!["id", "name", "language"]), + )?; + + let n_files_before = input_file.height(); + + logger.log(&format!( + " {} files found in the input file, filtering by selected languages", + n_files_before + ))?; + + input_file = map_err( + input_file + .lazy() + .filter(col("language").is_in(lit(languages_series))) + .collect(), + "Error filtering languages", + )?; + + let n_files = input_file.height(); + + logger.log(&format!( + " {} files found after filtering ({:.2} %)", + n_files, + if n_files_before == 0 { + 0 + } else { + n_files / n_files_before * 100 + } + ))?; + + logger.log_seed(seed)?; + + let mut shuffled_idx = (0..input_file.height()).collect::>(); + + logger.log_completion("Loading files in random order", || { + let mut rng: StdRng = SeedableRng::seed_from_u64(seed); + shuffled_idx.shuffle(&mut rng); + Ok(()) + })?; + + let shuffled_rows = shuffled_idx.into_iter().map(|idx| { + let row = input_file.get_row(idx).unwrap().0; + match (row[0].clone(), row[1].clone(), row[2].clone()) { + (AnyValue::UInt32(id), AnyValue::String(path), AnyValue::String(lang)) => Ok(( + id, + path.replace("-was_comma-", ",") + .replace("-was_quote-", "\""), + lang, + )), + _ => Err(idx), + } + }); + + const OUTPUT_COLS: usize = 9; + const LOGS_COLS: usize = 5; + + let word_counter: Matcher = Matcher::words_matcher(); + + let mut output_file = CSVFile::new(output_path, FileMode::Overwrite)?; + + let header: [&str; OUTPUT_COLS] = [ + "id", + "path", + "name", + "position", + "language", + "loc", + "words", + "params", + "parse_error", + ]; + + output_file.write_header(&header)?; + + let mut logs_file = 
CSVFile::new(logs_path, FileMode::Overwrite)?; + + let logs_header: [&str; LOGS_COLS] = ["id", "name", "language", "functions", "parse_error"]; + + logs_file.write_header(&logs_header)?; + + let iter = Mutex::new(shuffled_rows.into_iter()); + + let (tx, rx) = + crossbeam_channel::unbounded::), Error>>>(); + + map_err_debug( + crossbeam::thread::scope(|s| { + for _ in 0..threads { + s.spawn(|_| { + let my_tx = tx.clone(); + loop { + let next_item: Option> = { + let mut iter_guard = iter.lock().unwrap(); + iter_guard.next() + }; + + match next_item { + Some(row) => match row { + Ok((project_id, file_name, language)) => match analyze_file_simple( + project_id, + &file_name, + language, + fail_policy, + &word_counter, + ) { + Ok(s) => { + my_tx.send(Some(Ok(s))).unwrap(); + } + Err(e) => { + my_tx.send(Some(e.to_res())).unwrap(); + break; + } + }, + Err(row_nr) => { + let _ = my_tx.send(Some( + Error::new(&format!("Could not parse row {}", row_nr)) + .to_res(), + )); + } + }, + None => { + my_tx.send(None).unwrap(); + break; + } + } + } + }); + } + + let mut ended_threads = 0; + + let progress = ProgressBar::new(n_files as u64); + progress.set_style(map_err( + indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%"), + "Invalid progress bar style", + )?); + + while let Ok(msg) = rx.recv() { + match msg { + Some(Ok((output, opt_log))) => { + map_err( + write!(&mut output_file, "{}", output), + &format!("Error writing {}", output_path), + )?; + if let Some(log) = opt_log { + map_err( + writeln!(&mut logs_file, "{}", log), + &format!("Error writing {}", logs_path), + )?; + } + progress.inc(1); + } + Some(Err(e)) => { + e.chain("Error in child thread").to_res::<()>()?; + } + None => { + ended_threads += 1; + if ended_threads == threads { + break; + } + } + } + } + progress.finish(); + Ok(()) + }), + "Error in one of the threads", + )? +} + +/// Simplified version of analyze_file that extracts all functions without keyword filtering. 
+fn analyze_file_simple( + project_id: u32, + path: &str, + language: &str, + fail_policy: &str, + word_counter: &Matcher, +) -> Result<(String, Option), Error> { + match language_to_grammar(language) { + Some(grammar) => { + let mut parser: Parser = Parser::new(); + map_err(parser.set_language(&grammar.lang), "Cannot load grammar")?; + match load_file(path, 1024 * 1024 * 1024)? { + Ok(source_code) => { + let target_folder: String = format!("{}.functions_simple", path); + create_dir(&target_folder)?; + + let tree: Tree = ok_or_else( + parser.parse(&source_code, None), + &format!("Error parsing file {}", path), + )?; + + let file_has_parse_error: bool = tree.root_node().has_error(); + + if file_has_parse_error && fail_policy == "skip-file" { + Ok((String::new(), None)) + } else if file_has_parse_error && fail_policy == "abort" { + Error::new(&format!("Parse error in file {}", path)).to_res() + } else { + let root: Node<'_> = tree.root_node(); + let (output, total_functions) = extract_functions_simple( + project_id, + &root, + &target_folder, + language, + &grammar, + &source_code, + fail_policy, + word_counter, + )?; + + let error_position: String = if file_has_parse_error { + position_to_string(find_first_error_position(&root)) + } else { + "none".to_string() + }; + + Ok(( + output, + Some(format!( + "{},{},{},{},{}", + project_id, + path.replace(",", "-was_comma-") + .replace("\"", "-was_quote-"), + language, + total_functions, + error_position, + )), + )) + } + } + + Err(_) => Ok(( + String::new(), + Some(format!( + "{},{},{},-1,{}", + project_id, + path.replace(",", "-was_comma-") + .replace("\"", "-was_quote-"), + language, + "none", + )), + )), + } + } + None => Error::new(&format!("Unsupported language: {}", language)).to_res(), + } +} + +/// Simplified version of extract_functions that extracts all functions without keyword filtering. 
+fn extract_functions_simple( + project_id: u32, + root: &Node, + target_folder: &str, + language: &str, + grammar: &Grammar, + source: &[u8], + fail_policy: &str, + word_counter: &Matcher, +) -> Result<(String, usize), Error> { + let mut builder: String = String::new(); + let mut functions: usize = 0; + + let mut call_stack: Vec = Vec::new(); + call_stack.push(*root); + let mut cursor = root.walk(); + + while let Some(node) = call_stack.pop() { + if grammar.function_nodes.contains(node.kind()) { + let has_error: bool = node.has_error(); + + if (has_error && fail_policy == "skip-function") + || (language == "java" && find_fields(&node, "body").is_empty()) + { + continue; + } else { + let function_source_code: &[u8] = node_source_code(&node, source); + let function_position: (usize, usize) = ( + node.start_position().row + 1, + node.start_position().column + 1, + ); + + let error_position: String = if has_error { + position_to_string(find_first_error_position(&node).map(|(row, col)| { + let error_row = row - function_position.0 + 1; + if row == function_position.0 { + (error_row, col - function_position.1 + 1) + } else { + (error_row, col) + } + })) + } else { + "none".to_string() + }; + + let function_code_with_strings: &Vec = + &remove_kind_from_source(function_source_code, &node, &grammar.comment_nodes); + /* + let tree_without_comments: Tree = ok_or_else( + parser.parse(function_code_with_strings, None), + &format!( + "Error parsing code for function {}/{}", + target_folder, functions + ), + )?; + + let function_code = &remove_kind_from_source( + function_code_with_strings, + &tree_without_comments.root_node(), + &grammar.string_literal_nodes, + ); */ + + let function_path: String = format!( + "{}/{}-{}", + target_folder, function_position.0, function_position.1 + ); + + map_err( + std::fs::write(&function_path, function_source_code), + &format!("Cannot write function code to {}", function_path), + )?; + + let params_vec: Vec> = + find_first_node_of_kind(&node, 
&grammar.param_seq_nodes, true); + + let mut name: String = String::from_utf8_lossy( + find_first_field(&node, grammar.name_field) + .map(|n| node_source_code(&n, source)) + .unwrap_or(b""), + ) + .to_string(); + if let Some(idx) = name.find('(') { + name.truncate(idx); + } + name = name.chars().filter(|c| !c.is_whitespace()).collect(); + + let mut n_param: usize = 0; + for params in params_vec { + n_param += count_nodes_of_kind(¶ms, &grammar.param_nodes).0; + } + + map_err( + writeln!( + &mut builder, + "{},{},{},{},{},{},{},{},{}", + project_id, + &function_path + .replace(",", "-was_comma-") + .replace("\"", "-was_quote-"), + name.replace(",", "-was_comma-") + .replace("\"", "-was_quote-"), + position_to_string(Some(function_position)), + language, + count_text_lines(function_code_with_strings), + word_counter.count_matches_in_text(function_code_with_strings), + n_param, + error_position, + ), + &format!("Error writing function statistics of {}", function_path), + )?; + + functions += 1; + } + } else { + for c in node + .children(&mut cursor) + .collect::>() + .into_iter() + .rev() + { + call_stack.push(c); + } + } + } + Ok((builder, functions)) +} + +/// Returns the source code of a node in the parse tree +/// +/// # Arguments +/// +/// * `n` - The node to extract the source code from. +/// * `source` - The source code of the whole file. +fn node_source_code<'a>(n: &Node, source: &'a [u8]) -> &'a [u8] { + &source[n.start_byte()..n.end_byte()] +} + +/// Grammar of a programming language. +#[allow(dead_code)] +struct Grammar { + /// The programming language the grammar belongs to. + lang: Language, + + /// Nodes representing comments. + comment_nodes: HashSet<&'static str>, + + /// Nodes representing string literals. + string_literal_nodes: HashSet<&'static str>, + + /// Nodes representing loops. + loop_nodes: HashSet<&'static str>, + + /// Nodes representing conditional statements. 
+ cond_nodes: HashSet<&'static str>, + + /// Nodes representing functions or methods. + function_nodes: HashSet<&'static str>, + + /// Nodes representing function or method calls. + function_call_nodes: HashSet<&'static str>, + + /// Nodes representing a sequence of parameters of a function or method. + param_seq_nodes: HashSet<&'static str>, + + /// Nodes representing a parameter of a function or method. + param_nodes: HashSet<&'static str>, + + /// The field name of the parameter type. + param_type_field: Option<&'static str>, + + /// The field name of the function or method name. + name_field: &'static str, +} + +/// Returns the grammar for the C programming language. +fn c_grammar() -> Grammar { + Grammar { + lang: tree_sitter_c::LANGUAGE.into(), + comment_nodes: vec!["comment"].into_iter().collect(), + string_literal_nodes: vec!["string_literal"].into_iter().collect(), + loop_nodes: vec!["for_statement", "while_statement", "do_statement"] + .into_iter() + .collect(), + cond_nodes: vec!["if_statement", "switch_statement", "conditional_expression"] + .into_iter() + .collect(), + function_nodes: vec!["function_definition"].into_iter().collect(), + function_call_nodes: vec!["call_expression"].into_iter().collect(), + param_seq_nodes: vec!["parameter_list"].into_iter().collect(), + param_nodes: vec!["parameter_declaration"].into_iter().collect(), + param_type_field: Some("type"), + name_field: "declarator", + } +} + +/// Returns the grammar for the C++ programming language. 
+fn cpp_grammar() -> Grammar { + Grammar { + lang: tree_sitter_cpp::LANGUAGE.into(), + comment_nodes: vec!["comment"].into_iter().collect(), + string_literal_nodes: vec!["string_literal"].into_iter().collect(), + loop_nodes: vec!["for_range_loop", "for_statement", "while_statement"] + .into_iter() + .collect(), + cond_nodes: vec!["if_statement", "switch_statement", "conditional_expression"] + .into_iter() + .collect(), + function_nodes: vec!["function_definition", "template_declaration"] + .into_iter() + .collect(), + function_call_nodes: vec!["call_expression"].into_iter().collect(), + param_seq_nodes: vec!["parameter_list"].into_iter().collect(), + param_nodes: vec!["parameter_declaration", "variadic_parameter_declaration"] + .into_iter() + .collect(), + param_type_field: Some("type"), + name_field: "declarator", + } +} + +/// Returns the grammar for the C# programming language. +fn cs_grammar() -> Grammar { + Grammar { + lang: tree_sitter_c_sharp::LANGUAGE.into(), + comment_nodes: vec!["comment"].into_iter().collect(), + string_literal_nodes: vec![ + "string_literal", + "verbatim_string_literal", + "raw_string_literal", + ] + .into_iter() + .collect(), + loop_nodes: vec!["for_statement", "while_statement", "do_statement"] + .into_iter() + .collect(), + cond_nodes: vec!["if_statement", "switch_statement", "conditional_expression"] + .into_iter() + .collect(), + function_nodes: vec![ + "method_declaration", + "constructor_declaration", + "operator_declaration", + ] + .into_iter() + .collect(), + function_call_nodes: vec!["invocation_expression"].into_iter().collect(), + param_seq_nodes: vec!["parameter_list"].into_iter().collect(), + param_nodes: vec!["parameter"].into_iter().collect(), + param_type_field: Some("type"), + name_field: "name", + } +} + +/// Returns the grammar for the TypeScript programming language. 
+fn ts_grammar() -> Grammar { + Grammar { + lang: tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(), + comment_nodes: vec!["comment"].into_iter().collect(), + string_literal_nodes: vec!["string_fragment"].into_iter().collect(), + loop_nodes: vec!["for_statement", "for_in_statement", "while_statement"] + .into_iter() + .collect(), + cond_nodes: vec!["if_statement", "switch_statement", "ternary_expression"] + .into_iter() + .collect(), + function_nodes: vec!["function_declaration", "method_definition"] + .into_iter() + .collect(), + function_call_nodes: vec![ + "new_expression", + "call_expression", + "decorator_call_expression", + ] + .into_iter() + .collect(), + param_seq_nodes: vec!["formal_parameters"].into_iter().collect(), + param_nodes: vec!["required_parameter", "optional_parameter"] + .into_iter() + .collect(), + param_type_field: Some("type"), + name_field: "name", + } +} + +/// Returns the grammar for the Go programming language. +fn go_grammar() -> Grammar { + Grammar { + lang: tree_sitter_go::LANGUAGE.into(), + comment_nodes: vec!["comment"].into_iter().collect(), + string_literal_nodes: vec!["raw_string_literal", "interpreted_string_literal"] + .into_iter() + .collect(), + loop_nodes: vec!["for_statement"].into_iter().collect(), + cond_nodes: vec![ + "if_statement", + "type_switch_statement", + "expression_switch_statement", + ] + .into_iter() + .collect(), + function_nodes: vec!["function_declaration", "method_declaration"] + .into_iter() + .collect(), + function_call_nodes: vec!["call_expression"].into_iter().collect(), + param_seq_nodes: vec!["parameter_list"].into_iter().collect(), + param_nodes: vec!["parameter_declaration", "variadic_parameter_declaration"] + .into_iter() + .collect(), + param_type_field: Some("type"), + name_field: "name", + } +} + +/// Returns the grammar for the Java programming language. 
+fn java_grammar() -> Grammar { + Grammar { + lang: tree_sitter_java::LANGUAGE.into(), + comment_nodes: vec!["line_comment", "block_comment"].into_iter().collect(), + string_literal_nodes: vec!["string_literal"].into_iter().collect(), + loop_nodes: vec![ + "for_statement", + "enhanced_for_statement", + "while_statement", + "do_statement", + ] + .into_iter() + .collect(), + cond_nodes: vec!["if_statement", "ternary_expression", "switch_expression"] + .into_iter() + .collect(), + function_nodes: vec!["method_declaration", "compact_constructor_declaration"] + .into_iter() + .collect(), + function_call_nodes: vec!["method_invocation", "explicit_constructor_invocation"] + .into_iter() + .collect(), + param_seq_nodes: vec!["formal_parameters"].into_iter().collect(), + param_nodes: vec!["formal_parameter"].into_iter().collect(), + param_type_field: Some("type"), + name_field: "name", + } +} + +/// Returns the grammar for the Scala programming language. +fn scala_grammar() -> Grammar { + Grammar { + lang: tree_sitter_scala::LANGUAGE.into(), + comment_nodes: vec!["comment", "block_comment"].into_iter().collect(), + string_literal_nodes: vec!["string"].into_iter().collect(), + loop_nodes: vec!["for_expression", "while_expression", "do_while_expression"] + .into_iter() + .collect(), + cond_nodes: vec!["if_expression", "match_expression"] + .into_iter() + .collect(), + function_nodes: vec!["function_definition"].into_iter().collect(), + function_call_nodes: vec!["call_expression"].into_iter().collect(), + param_seq_nodes: vec!["parameters"].into_iter().collect(), + param_nodes: vec!["parameter"].into_iter().collect(), + param_type_field: Some("type"), + name_field: "name", + } +} + +/// Returns the grammar for the Fortran programming language. 
+fn fortran_grammar() -> Grammar { + Grammar { + lang: tree_sitter_fortran::LANGUAGE.into(), + comment_nodes: vec!["preproc_comment", "comment"].into_iter().collect(), + string_literal_nodes: vec!["string_literal"].into_iter().collect(), + loop_nodes: vec![ + "loop_control_expression", + "where_statement", + "forall_statement", + "concurrent_statement", + "while_statement", + ] + .into_iter() + .collect(), + cond_nodes: vec![ + "if_statement", + "arithmetic_if_statement", + "select_case_statement", + "select_rank_statement", + "select_type_statement", + ] + .into_iter() + .collect(), + function_nodes: vec!["function", "subroutine"].into_iter().collect(), + function_call_nodes: vec!["call_expression", "subroutine_call"] + .into_iter() + .collect(), + param_seq_nodes: vec!["parameters"].into_iter().collect(), + param_nodes: vec!["identifier"].into_iter().collect(), + param_type_field: None, + name_field: "name", + } +} + +/// Returns the grammar for the Python programming language. +fn python_grammar() -> Grammar { + Grammar { + lang: tree_sitter_python::LANGUAGE.into(), + comment_nodes: vec!["comment"].into_iter().collect(), + string_literal_nodes: vec!["string"].into_iter().collect(), + loop_nodes: vec!["for_statement", "while_statement"] + .into_iter() + .collect(), + cond_nodes: vec!["if_statement", "conditional_expression", "match_statement"] + .into_iter() + .collect(), + function_nodes: vec!["function_definition", "lambda"].into_iter().collect(), + function_call_nodes: vec!["call"].into_iter().collect(), + param_seq_nodes: vec!["parameters"].into_iter().collect(), + param_nodes: vec!["parameter"].into_iter().collect(), + param_type_field: None, + name_field: "name", + } +} + +/// Returns the grammar corresponding to the given language. +/// +/// # Arguments +/// +/// * `language` - The language of the file. +/// +/// # Returns +/// +/// The grammar corresponding to the language or `None` if the language is not supported. 
+fn language_to_grammar(lang: &str) -> Option { + match lang { + "c" => Some(c_grammar()), + "c++" => Some(cpp_grammar()), + "c#" => Some(cs_grammar()), + "java" => Some(java_grammar()), + "fortran" => Some(fortran_grammar()), + "python" => Some(python_grammar()), + "typescript" => Some(ts_grammar()), + "go" => Some(go_grammar()), + "scala" => Some(scala_grammar()), + _ => None, + } +} + +/// Counts the number of nodes of given kinds in a tree. +/// +/// # Arguments +/// +/// * `node` - The root node of the tree. +/// * `kind` - The kinds of nodes to count. +/// +/// # Returns +/// +/// A tuple containing the number of nodes of the given kind and the maximum nesting level of these nodes. +/// +/// # Example +/// +/// The function applied to a node representing the following code will return `(2, 2)` if the kind is `if_statement`: +/// +/// ```c +/// int main(int a, int b) { +/// if (b > 0) { +/// if (a > b) { +/// return a; +/// } else { +/// return b; +/// } +/// } +/// return 0; +/// } +/// ``` +/// +fn count_nodes_of_kind(root: &Node, kinds: &HashSet<&str>) -> (usize, usize) { + let mut node_count = 0; + let mut max_nesting = 0; + + let mut cursor = root.walk(); + + // Simulating call stack + let mut call_stack: Vec<(Node, usize)> = Vec::new(); + call_stack.push((*root, 1)); + + while let Some((node, depth)) = call_stack.pop() { + let is_of_kind = kinds.contains(node.kind()); + + if is_of_kind { + node_count += 1; + max_nesting = max_nesting.max(depth); + } + + // We don't reverse nodes for performance (yields the same result) + for child in node.children(&mut cursor) { + call_stack.push((child, if is_of_kind { depth + 1 } else { depth })); + } + } + + (node_count, max_nesting) +} + +fn find_first_node<'a>( + node: &Node<'a>, + pred: &dyn Fn(&Node) -> bool, + breadth: bool, +) -> Vec> { + let mut cursor = node.walk(); + let mut call_stack: Vec<(Node, usize)> = Vec::new(); + call_stack.push((*node, 0)); + + let mut res: Vec> = Vec::new(); + let mut max_depth: 
Option = None; + + while let Some((node, depth)) = call_stack.pop() { + if max_depth.filter(|&d| depth > d).is_some() { + return res; + } else if pred(&node) { + if breadth { + res.push(node); + if max_depth.is_none() { + max_depth = Some(depth); + } + } else { + return vec![node]; + } + } else if breadth { + let mut end_queue: Vec<(Node, usize)> = + node.children(&mut cursor).map(|c| (c, depth + 1)).collect(); + end_queue.extend(call_stack); + call_stack = end_queue; + } else { + for c in node + .children(&mut cursor) + .collect::>() + .into_iter() + .rev() + { + call_stack.push((c, 0)); + } + } + } + vec![] +} + +fn find_first_node_of_kind<'a>( + root: &Node<'a>, + kind: &HashSet<&str>, + breadth: bool, +) -> Vec> { + find_first_node(root, &|n: &Node| kind.contains(n.kind()), breadth) +} + +/// Finds the first error node in the tree +/// +/// # Arguments +/// +/// * `root` - The root node of the tree. +/// +/// # Returns +/// +/// The first error node found in the tree, or `None` if no error node is found. 
+fn find_first_error_node<'a>(root: &Node<'a>) -> Option> { + find_first_node(root, &|n: &Node| n.is_error() || n.is_missing(), false) + .into_iter() + .next() +} + +fn find_first_error_position(root: &Node) -> Option<(usize, usize)> { + find_first_error_node(root).map(|n| (n.start_position().row + 1, n.start_position().column + 1)) +} + +fn position_to_string(position: Option<(usize, usize)>) -> String { + match position { + Some((row, col)) => format!("{}:{}", row, col), + None => "not-found".to_string(), + } +} + +fn find_fields<'a>(root: &Node<'a>, field: &str) -> Vec> { + let mut res: Vec> = Vec::new(); + let mut ids: HashSet = HashSet::new(); + + let mut cursor = root.walk(); + + // Simulating call stack + let mut call_stack: Vec = Vec::new(); + call_stack.push(*root); + + while let Some(node) = call_stack.pop() { + for c in node.children_by_field_name(field, &mut node.walk()) { + res.push(c); + ids.insert(c.id()); + } + + // We don't reverse nodes for performance (yields the same result) + for c in node + .children(&mut cursor) + .collect::>() + .into_iter() + .rev() + { + if !ids.contains(&c.id()) { + call_stack.push(c); + } + } + } + + res +} + +fn find_first_field<'a>(root: &Node<'a>, field: &str) -> Option> { + let mut cursor = root.walk(); + + // Simulating call stack + let mut call_stack: Vec = Vec::new(); + call_stack.push(*root); + + while let Some(node) = call_stack.pop() { + if let Some(c) = node.child_by_field_name(field) { + return Some(c); + } + + // We don't reverse nodes for performance (yields the same result) + for c in node + .children(&mut cursor) + .collect::>() + .into_iter() + .rev() + { + call_stack.push(c); + } + } + + None +} + +fn find_kind<'a>(root: &Node<'a>, kinds: &HashSet<&str>) -> Vec> { + let mut res: Vec> = Vec::new(); + + let mut cursor = root.walk(); + + // Simulating call stack + let mut call_stack: Vec = Vec::new(); + call_stack.push(*root); + + while let Some(node) = call_stack.pop() { + if kinds.contains(node.kind()) { 
+ res.push(node); + } else { + // We don't reverse nodes for performance (yields the same result) + for c in node.children(&mut cursor) { + call_stack.push(c); + } + } + } + + res +} + +fn remove_kind_from_source(source: &[u8], root: &Node, kinds: &HashSet<&str>) -> Vec { + let mut nodes = find_kind(root, kinds); + nodes.sort_by_key(|b| std::cmp::Reverse(b.start_byte())); + // Disable mutability + let nodes = nodes; + + let root_start = root.start_byte(); + let mut new_source = source.to_vec(); + for n in nodes { + new_source.drain(n.start_byte() - root_start..n.end_byte() - root_start); + } + new_source +} + +//-------------------------------------------------------- + +#[cfg(test)] +mod tests { + use std::path::Path; + + use polars::prelude::SortMultipleOptions; + + use crate::utils::dataframes::*; + use crate::utils::fs::*; + + use super::*; + + const TEST_DATA: &str = "tests/data/phases/parse"; + + fn test_parse(input_file_path: &str, languages: Option>, should_pass: bool) { + let input_df = open_csv(&input_file_path, None, None); + assert!(input_df.is_ok()); + let input_df = input_df.unwrap(); + assert!(has_column(&input_df, "name")); + let input_df = input_df.column("name").unwrap().str().unwrap(); + + let output_file_path = format!("{}.functions.csv", input_file_path); + assert!(delete_file(&output_file_path, true).is_ok()); + + let logs_file_path = format!("{}.function_logs.csv", input_file_path); + assert!(delete_file(&logs_file_path, true).is_ok()); + + for path in input_df { + assert!(path.is_some()); + assert!(delete_dir(&format!("{}.functions", path.unwrap()), true).is_ok()); + } + + if should_pass { + assert!(run( + input_file_path, + None, + None, + languages, + "ignore", + 8, + 0, + false, + &mut Logger::new() + ) + .is_ok()); + + let logs_df = open_csv(&logs_file_path, None, None); + assert!(logs_df.is_ok()); + let logs_df = logs_df.unwrap(); + assert!(has_column(&logs_df, "name")); + let sorted_logs_df = logs_df + .sort(vec!["name"], 
SortMultipleOptions::new()) + .unwrap(); + + let expected_logs_df = open_csv( + &format!("{}.function_logs.csv.expected", input_file_path), + None, + None, + ); + assert!(expected_logs_df.is_ok()); + let expected_logs_df = expected_logs_df.unwrap(); + assert!(has_column(&expected_logs_df, "name")); + let sorted_expected_logs_df = expected_logs_df + .sort(vec!["name"], SortMultipleOptions::new()) + .unwrap(); + assert!(sorted_expected_logs_df.equals(&sorted_logs_df)); + + let output_df = open_csv(&output_file_path, None, None); + assert!(output_df.is_ok()); + let output_df = output_df.unwrap(); + assert!(has_column(&output_df, "path")); + let sorted_output_df = output_df + .sort(vec!["path"], SortMultipleOptions::new()) + .unwrap(); + + let expected_df = open_csv(&format!("{}.expected", output_file_path), None, None); + assert!(expected_df.is_ok()); + let expected_df = expected_df.unwrap(); + assert!(has_column(&expected_df, "path")); + let sorted_expected_df = expected_df + .sort(vec!["path"], SortMultipleOptions::new()) + .unwrap(); + + assert!(sorted_expected_df.equals(&sorted_output_df)); + + for path in sorted_output_df.column("path").unwrap().str().unwrap() { + assert!(path.is_some()); + let path = Path::new(path.unwrap()); + assert!(path.exists()); + let expected_path_name = format!( + "{}.expected/{}", + path.parent().unwrap().to_str().unwrap(), + path.file_name().unwrap().to_str().unwrap() + ); + let expected_path = Path::new(&expected_path_name); + assert_eq!( + std::fs::read_to_string(path).unwrap(), + std::fs::read_to_string(expected_path).unwrap() + ); + } + } else { + assert!(run( + input_file_path, + None, + None, + languages, + "ignore", + 8, + 0, + false, + &mut Logger::new() + ) + .is_err()); + } + + assert!(delete_file(&output_file_path, true).is_ok()); + assert!(delete_file(&logs_file_path, true).is_ok()); + + for path in input_df { + assert!(path.is_some()); + assert!(delete_dir(&format!("{}.functions", path.unwrap()), true).is_ok()); + } + } + 
+ #[test] + fn parse_fp() { + let input_file_path = format!("{}/to_parse.csv", TEST_DATA); + + test_parse(&input_file_path, None, true); + } + + #[test] + fn parse_go() { + let input_file_path = format!("{}/parse_go.csv", TEST_DATA); + + test_parse(&input_file_path, None, true); + } + + #[test] + fn invalid_file() { + let input_file_path = format!("{}/invalid.csv", TEST_DATA); + + test_parse(&input_file_path, None, true); + } + + #[test] + fn invalid_lang() { + let input_file_path = format!("{}/empty.csv", TEST_DATA); + + test_parse(&input_file_path, Some(["rust"].to_vec()), false); + } + + #[test] + fn empty() { + let input_file_path = format!("{}/empty.csv", TEST_DATA); + + test_parse(&input_file_path, Some(["c"].to_vec()), true); + } +} diff --git a/src/phases/mod.rs b/src/phases/mod.rs index 03ab30c..d70647e 100644 --- a/src/phases/mod.rs +++ b/src/phases/mod.rs @@ -24,3 +24,5 @@ pub mod languages; pub mod metadata; pub mod parse; pub mod pull_request; +//pub mod tokenizer; +pub mod alt_parse; From bbfcb17a43018984acd66837af9edb4f75c40f9e Mon Sep 17 00:00:00 2001 From: swartling Date: Fri, 6 Mar 2026 16:22:42 +0100 Subject: [PATCH 02/14] implemented global count of tokens --- src/bin/main.rs | 12 ++- src/phases/mod.rs | 4 +- src/phases/tokenizer.rs | 209 ++++++++++++++++++++++++++++++++++++++++ src/utils/bow.rs | 28 ++++++ 4 files changed, 250 insertions(+), 3 deletions(-) create mode 100644 src/phases/tokenizer.rs diff --git a/src/bin/main.rs b/src/bin/main.rs index ea61c47..ba4b110 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command}; use scyros::phases::{ alt_parse, download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages, - filter_metadata, forks, ids, languages, metadata, parse, pull_request, + filter_metadata, forks, ids, languages, metadata, parse, pull_request, tokenizer, }; use scyros::utils::error::*; use scyros::utils::logger::Logger; @@ -37,6 +37,7 @@ fn cli() -> Command { 
.subcommand(parse::cli()) .subcommand(extract_benchmarks::cli()) .subcommand(alt_parse::cli()) + .subcommand(tokenizer::cli()) .arg( Arg::new("debug") .long("debug") @@ -231,6 +232,15 @@ fn main() { &mut logger, ) } + else if subcommand == tokenizer::cli().get_name() { + tokenizer::run( + cli_subargs.get_one::("input").unwrap(), + //cli_subargs.get_one::("output").map(|x| x.as_str()), + //cli_subargs.get_one::("language").unwrap(), + cli_subargs.get_one::("example_word").unwrap(), + &mut logger, + ) + } else { Error::new(&format!("The subcommand {} is not available. Run the program with the --help flag to see the list of subcommands", subcommand)).to_res() } diff --git a/src/phases/mod.rs b/src/phases/mod.rs index d70647e..82cf74c 100644 --- a/src/phases/mod.rs +++ b/src/phases/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod alt_parse; pub mod download; pub mod duplicate_files; pub mod duplicate_ids; @@ -24,5 +25,4 @@ pub mod languages; pub mod metadata; pub mod parse; pub mod pull_request; -//pub mod tokenizer; -pub mod alt_parse; +pub mod tokenizer; diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs new file mode 100644 index 0000000..612ed5e --- /dev/null +++ b/src/phases/tokenizer.rs @@ -0,0 +1,209 @@ +/* use std::arch::global_asm; +use std::collections::HashMap; +use std::f32::consts::E; */ + +use crate::utils::bow::*; +use crate::utils::error::*; +use crate::utils::fs::*; +use crate::utils::regex::*; +use crate::utils::{/* csv::*, */ logger::Logger}; +/* use clang::token; +use polars::frame::row; */ +use clap::{Arg, /* ArgAction, */ Command}; +use polars::prelude::*; + +/* struct Token { + word: String, + local_count: usize, + global_count: usize, + global_position: usize, +} */ + +pub fn cli() -> Command { + Command::new("tokenizer") + .about("Tokenizes the functions in the input file and generates a global Bag of Words.") + .disable_version_flag(true) + 
.arg( + Arg::new("input") + .short('i') + .long("input") + .value_name("INPUT_PATH") + .help("Path to the input CSV file generated by the parser.") + .required(true), + ) + .arg( + Arg::new("example_word") + .short('e') + .long("example-word") + .value_name("EXAMPLE_WORD") + .help("An example word to check the global Bag of Words for.") + .required(false) + .default_value("if"), + ) +} + +pub fn run( + input_path: &str, //path to csv provided by parser + //output_path: &str, + //language: &str, + example_word: &str, + logger: &mut Logger, +) -> Result<(), Error> { + //No checks for language yet. Just uses java for now. Will add more languages later. + let language = "java"; + let minimum_loc = 5; //temporary + //let separators = vec!["(", ")", "[", "]", "{", "}", ";", ".", ",", ":", "=", "+", "-", "*", "/", "%", "<", ">", "&", "|", "!", "?", "~", "^", "#", "$", "@", "\"", "\\", "`", "'"]; //hardcoded separators for now. Will add more later and make it configurable. + + let mut input_file = open_csv( + input_path, + Some(Schema::from_iter(vec![ + Field::new("id".into(), DataType::UInt32), + Field::new("path".into(), DataType::String), + Field::new("name".into(), DataType::String), + Field::new("position".into(), DataType::String), + Field::new("loc".into(), DataType::UInt32), + Field::new("words".into(), DataType::UInt32), + ])), + Some(vec![ + "id", "path", "name", "position", "language", "loc", "words", + ]), + )?; + + let n_functions_before_language = input_file.height(); + logger.log(&format!( + " {} functions found in the input file, filtering by selected language", + n_functions_before_language + ))?; + + //input_file = input_file.filter(&input_file.column("language")?.equal(language)); + input_file = map_err( + input_file + .lazy() + .filter(col("language").eq(lit(language))) + .collect(), + "Error filtering language", + )?; + + let n_functions_after_language = input_file.height(); + + logger.log(&format!( + " {} functions found after filtering ({:.2} %)", + 
n_functions_after_language, + if n_functions_before_language == 0 { + 0 + } else { + n_functions_after_language / n_functions_before_language * 100 + } + ))?; + + let n_functions_before_loc = input_file.height(); + + logger.log(&format!( + "{} functions found in the input file. Filtering those with less than {} lines of code.", + n_functions_before_loc, minimum_loc + ))?; + + //input_file = input_file.filter(&input_file.column("loc")?.greater_equal(minimum_loc))?; + + input_file = map_err( + input_file + .lazy() + .filter(col("loc").gt_eq(lit(minimum_loc))) + .collect(), + "Error filtering by lines of code", + )?; + + let n_functions_after_loc = input_file.height(); + + logger.log(&format!( + " {} functions found after filtering by lines of code ({:.2} %)", //something is weird with the percentage calculation here. + n_functions_after_loc, + if n_functions_before_loc == 0 { + 0 + } else { + n_functions_after_loc / n_functions_before_loc * 100 + } + ))?; + let global_bow = global_counter(&input_file, logger)?; + + let token_rankings = global_bow.token_rankings(); + + let example_word = example_word.to_ascii_lowercase(); + let example_word_token = example_word.as_bytes(); + + logger.log("Tokenizer seems to have completed")?; + logger.log(&format!( + " The token '{}' appears {} times and is ranked {} in the global Bag of Words.", + example_word, + token_rankings + .get(example_word_token) + .map(|(count, _)| *count) + .unwrap_or(0), + token_rankings + .get(example_word_token) + .map(|(_, rank)| *rank) + .unwrap_or(0) + ))?; + + Ok(()) +} + +fn global_counter(input_file: &DataFrame, logger: &mut Logger) -> Result { + let word_matcher: Matcher = Matcher::words_matcher(); + let mut global_bow: Bow = Bow::new(); + + for row in input_file + .column("path") + .and_then(|c| c.str()) + .unwrap() + .into_iter() + { + match row { + Some(path) => { + //let function_code = std::fs::read_to_string(path)?; + match load_file(path, 1024 * 1024 * 1024) { + Ok(Ok(function_code)) => { + 
let local_bow = + word_matcher.bag_of_words(&function_code.to_ascii_lowercase()); + global_bow.merge(local_bow); + } + Ok(Err(_e)) => { + logger.log(&format!(" Warning: File to large at path {}", path))?; + } + Err(_e) => { + logger.log(&format!(" Warning: Could not load file at path {}", path))?; + } + } + } + None => { + let _ = logger.log(" Warning: Path not found"); + } + } + } + + Ok(global_bow) +} + +/* fn tokenize_function( + function_code_path: &str, + separators: &Vec<&str>, + logger: &mut Logger +) -> Result<(HashMap), Error> { + let function_string = std::fs::read_to_string(function_code_path)?; + + + let mut tokenized_string = function_string.clone(); + for separator in separators { + tokenized_string = tokenized_string.replace(separator, " "); + } + + let words_in_string: Vec<&str> = tokenized_string.split_whitespace().collect(); + + let mut counts: std::collections::HashMap = std::collections::HashMap::new(); + + for word in words_in_string { + *counts.entry(word.to_string()).or_insert(0) += 1; + } + + Ok(counts) +} */ diff --git a/src/utils/bow.rs b/src/utils/bow.rs index 7676bad..4c2b347 100644 --- a/src/utils/bow.rs +++ b/src/utils/bow.rs @@ -81,6 +81,34 @@ impl Bow { .join("|") .into_bytes() } + + /// Merges another Bag of Words into this one, summing the counts of shared tokens. + /// + /// # Arguments + /// + /// * `other` - The other Bag of Words to be merged into this one. + pub fn merge(&mut self, other: Bow) { + for (token, count) in other.map { + *self.map.entry(token).or_insert(0) += count; + } + } + + /// Generates a ranking of tokens based on their frequency in the Bag of Words. + /// The ranking is a HashMap where the key is the token and the value is a tuple containing the frequency and the rank (1-based index). + /// Returns a HashMap where the key is the token and the value is a tuple containing the frequency and the rank. 
+ pub fn token_rankings(&self) -> HashMap, (usize, usize)> { + let mut rankings: HashMap, (usize, usize)> = HashMap::new(); + let mut count_vec: Vec<(&Vec, &usize)> = self.map.iter().collect(); + //count_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count in descending order + count_vec.sort_by(|a, b| { + b.1.cmp(a.1) // primary: count descending + .then_with(|| a.0.cmp(b.0)) // secondary: token ascending + }); + for (rank, (token, count)) in count_vec.into_iter().enumerate() { + rankings.insert(token.clone(), (*count, rank + 1)); + } + rankings + } } #[cfg(test)] From c7cad4a1ee42162f42a73d4a4fa27b5d377edb63 Mon Sep 17 00:00:00 2001 From: swartling Date: Mon, 9 Mar 2026 14:34:36 +0100 Subject: [PATCH 03/14] updated code to work with refactoring of upstream repository. alt_parse removed for now --- .gitignore | 1 + src/bin/main.rs | 9 +++-- src/phases/mod.rs | 2 +- src/phases/tokenizer.rs | 75 +++++++++++++++++++---------------------- 4 files changed, 40 insertions(+), 47 deletions(-) diff --git a/.gitignore b/.gitignore index 5e969b2..487d496 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ data/ ghtokens.csv !tests/data outputs/ +src/phases/alt_parse.rs //Got broken by refactoring but is not in use currently, so ignoring for now *.zip // Negation pattern in .gitignore diff --git a/src/bin/main.rs b/src/bin/main.rs index 721b90c..cea9fb4 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -15,7 +15,7 @@ use anyhow::{anyhow, Context, Result}; use clap::{Arg, ArgAction, Command}; use scyros::phases::{ - alt_parse, download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages, + download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages, filter_metadata, forks, ids, languages, metadata, parse, pull_request, tokenizer, }; use scyros::utils::logger::Logger; @@ -37,7 +37,6 @@ fn cli() -> Command { .subcommand(duplicate_files::cli()) .subcommand(parse::cli()) .subcommand(extract_benchmarks::cli()) - 
.subcommand(alt_parse::cli()) .subcommand(tokenizer::cli()) .arg( Arg::new("debug") @@ -216,7 +215,7 @@ fn main() { &logger, ) } - else if subcommand == alt_parse::cli().get_name() { + /* else if subcommand == alt_parse::cli().get_name() { alt_parse::run( cli_subargs.get_one::("input").unwrap(), cli_subargs.get_one::("output").map(|x| x.as_str()), @@ -232,14 +231,14 @@ fn main() { cli_subargs.get_flag("force"), &mut logger, ) - } + } */ else if subcommand == tokenizer::cli().get_name() { tokenizer::run( cli_subargs.get_one::("input").unwrap(), //cli_subargs.get_one::("output").map(|x| x.as_str()), //cli_subargs.get_one::("language").unwrap(), cli_subargs.get_one::("example_word").unwrap(), - &mut logger, + &logger, ) } else { diff --git a/src/phases/mod.rs b/src/phases/mod.rs index 82cf74c..58c2135 100644 --- a/src/phases/mod.rs +++ b/src/phases/mod.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod alt_parse; +//pub mod alt_parse; pub mod download; pub mod duplicate_files; pub mod duplicate_ids; diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs index 612ed5e..61a16dc 100644 --- a/src/phases/tokenizer.rs +++ b/src/phases/tokenizer.rs @@ -3,17 +3,18 @@ use std::collections::HashMap; use std::f32::consts::E; */ use crate::utils::bow::*; -use crate::utils::error::*; use crate::utils::fs::*; use crate::utils::regex::*; use crate::utils::{/* csv::*, */ logger::Logger}; +use anyhow::{/* anyhow, bail, Context, */ Result}; +use tracing::info; /* use clang::token; use polars::frame::row; */ use clap::{Arg, /* ArgAction, */ Command}; use polars::prelude::*; /* struct Token { - word: String, + word: Vec, local_count: usize, global_count: usize, global_position: usize, @@ -47,8 +48,8 @@ pub fn run( //output_path: &str, //language: &str, example_word: &str, - logger: &mut Logger, -) -> Result<(), Error> { + _logger: &Logger, //not used currently but hopefully will later +) -> 
Result<()> { //No checks for language yet. Just uses java for now. Will add more languages later. let language = "java"; let minimum_loc = 5; //temporary @@ -70,69 +71,61 @@ pub fn run( )?; let n_functions_before_language = input_file.height(); - logger.log(&format!( - " {} functions found in the input file, filtering by selected language", + info!( + "{} functions found in the input file, filtering by selected language", n_functions_before_language - ))?; + ); //input_file = input_file.filter(&input_file.column("language")?.equal(language)); - input_file = map_err( - input_file - .lazy() - .filter(col("language").eq(lit(language))) - .collect(), - "Error filtering language", - )?; + input_file = input_file + .lazy() + .filter(col("language").eq(lit(language))) + .collect()?; let n_functions_after_language = input_file.height(); - - logger.log(&format!( - " {} functions found after filtering ({:.2} %)", + info!( + " {} files found after filtering ({:.2} %)", n_functions_after_language, if n_functions_before_language == 0 { 0 } else { n_functions_after_language / n_functions_before_language * 100 } - ))?; - + ); let n_functions_before_loc = input_file.height(); - logger.log(&format!( - "{} functions found in the input file. 
Filtering those with less than {} lines of code.", - n_functions_before_loc, minimum_loc - ))?; + info!(" {} functions found after filtering by language, filtering functions with less that {} lines of code.", n_functions_before_loc, minimum_loc); //input_file = input_file.filter(&input_file.column("loc")?.greater_equal(minimum_loc))?; - input_file = map_err( - input_file - .lazy() - .filter(col("loc").gt_eq(lit(minimum_loc))) - .collect(), - "Error filtering by lines of code", - )?; + input_file = input_file + .lazy() + .filter(col("loc").gt_eq(lit(minimum_loc))) + .collect()?; let n_functions_after_loc = input_file.height(); - logger.log(&format!( - " {} functions found after filtering by lines of code ({:.2} %)", //something is weird with the percentage calculation here. + info!( + "{} functions found after filtering ({:.2} %)", //something is weird with the percentage calculation here. n_functions_after_loc, if n_functions_before_loc == 0 { 0 } else { n_functions_after_loc / n_functions_before_loc * 100 } - ))?; - let global_bow = global_counter(&input_file, logger)?; + ); + let global_bow = global_counter(&input_file)?; let token_rankings = global_bow.token_rankings(); let example_word = example_word.to_ascii_lowercase(); let example_word_token = example_word.as_bytes(); - logger.log("Tokenizer seems to have completed")?; - logger.log(&format!( + info!( + "Global Bag of Words generated. 
Checking for example word '{}'", + example_word + ); + info!( " The token '{}' appears {} times and is ranked {} in the global Bag of Words.", example_word, token_rankings @@ -143,12 +136,12 @@ pub fn run( .get(example_word_token) .map(|(_, rank)| *rank) .unwrap_or(0) - ))?; + ); Ok(()) } -fn global_counter(input_file: &DataFrame, logger: &mut Logger) -> Result { +fn global_counter(input_file: &DataFrame) -> Result { let word_matcher: Matcher = Matcher::words_matcher(); let mut global_bow: Bow = Bow::new(); @@ -168,15 +161,15 @@ fn global_counter(input_file: &DataFrame, logger: &mut Logger) -> Result { - logger.log(&format!(" Warning: File to large at path {}", path))?; + info!(" Warning: File to large at path {}", path); } Err(_e) => { - logger.log(&format!(" Warning: Could not load file at path {}", path))?; + info!(" Warning: Could not load file at path {}", path); } } } None => { - let _ = logger.log(" Warning: Path not found"); + info!(" Warning: Path not found"); } } } From 1d36c5aa445793688da8ba7af4d5a9d355f874d4 Mon Sep 17 00:00:00 2001 From: swartling Date: Mon, 9 Mar 2026 15:38:00 +0100 Subject: [PATCH 04/14] separated main function from cli call --- src/phases/tokenizer.rs | 63 ++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs index 61a16dc..3bed8df 100644 --- a/src/phases/tokenizer.rs +++ b/src/phases/tokenizer.rs @@ -6,13 +6,13 @@ use crate::utils::bow::*; use crate::utils::fs::*; use crate::utils::regex::*; use crate::utils::{/* csv::*, */ logger::Logger}; -use anyhow::{/* anyhow, bail, Context, */ Result}; +use anyhow::{anyhow, /*bail, Context, */ Result}; use tracing::info; /* use clang::token; use polars::frame::row; */ use clap::{Arg, /* ArgAction, */ Command}; use polars::prelude::*; - +use std::collections::HashMap; /* struct Token { word: Vec, local_count: usize, @@ -38,18 +38,42 @@ pub fn cli() -> Command { .long("example-word") 
.value_name("EXAMPLE_WORD") .help("An example word to check the global Bag of Words for.") - .required(false) - .default_value("if"), + .required(false), ) } -pub fn run( +pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> { + let token_rankings = run_tokenizer(input_path, logger)?; + + let example_word = example_word.to_ascii_lowercase(); + let example_word_token = example_word.as_bytes(); + + info!( + "Global Bag of Words generated. Checking for example word '{}'", + example_word + ); + info!( + " The token '{}' appears {} times and is ranked {} in the global Bag of Words.", + example_word, + token_rankings + .get(example_word_token) + .map(|(count, _)| *count) + .unwrap_or(0), + token_rankings + .get(example_word_token) + .map(|(_, rank)| *rank) + .unwrap_or(0) + ); + + Ok(()) +} + +pub fn run_tokenizer( input_path: &str, //path to csv provided by parser //output_path: &str, //language: &str, - example_word: &str, _logger: &Logger, //not used currently but hopefully will later -) -> Result<()> { +) -> Result, (usize, usize)>> { //No checks for language yet. Just uses java for now. Will add more languages later. let language = "java"; let minimum_loc = 5; //temporary @@ -116,29 +140,10 @@ pub fn run( ); let global_bow = global_counter(&input_file)?; - let token_rankings = global_bow.token_rankings(); - - let example_word = example_word.to_ascii_lowercase(); - let example_word_token = example_word.as_bytes(); + let token_rankings: std::collections::HashMap, (usize, usize)> = + global_bow.token_rankings(); - info!( - "Global Bag of Words generated. 
Checking for example word '{}'", - example_word - ); - info!( - " The token '{}' appears {} times and is ranked {} in the global Bag of Words.", - example_word, - token_rankings - .get(example_word_token) - .map(|(count, _)| *count) - .unwrap_or(0), - token_rankings - .get(example_word_token) - .map(|(_, rank)| *rank) - .unwrap_or(0) - ); - - Ok(()) + Some(token_rankings).ok_or_else(|| anyhow!("No tokens found in the global Bag of Words.")) } fn global_counter(input_file: &DataFrame) -> Result { From b830c46ee7fd38302c80fb8c34349f7df38525f4 Mon Sep 17 00:00:00 2001 From: swartling Date: Mon, 9 Mar 2026 15:55:24 +0100 Subject: [PATCH 05/14] run_tokenizer now take a dataframe instead of an input_path --- src/phases/mod.rs | 1 + src/phases/tokenizer.rs | 68 +++++++++++++++++++++-------------------- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/src/phases/mod.rs b/src/phases/mod.rs index 58c2135..ffc02b3 100644 --- a/src/phases/mod.rs +++ b/src/phases/mod.rs @@ -26,3 +26,4 @@ pub mod metadata; pub mod parse; pub mod pull_request; pub mod tokenizer; +//pub mod type_3_duplicate_files; diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs index 3bed8df..21900c4 100644 --- a/src/phases/tokenizer.rs +++ b/src/phases/tokenizer.rs @@ -43,38 +43,6 @@ pub fn cli() -> Command { } pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> { - let token_rankings = run_tokenizer(input_path, logger)?; - - let example_word = example_word.to_ascii_lowercase(); - let example_word_token = example_word.as_bytes(); - - info!( - "Global Bag of Words generated. 
Checking for example word '{}'", - example_word - ); - info!( - " The token '{}' appears {} times and is ranked {} in the global Bag of Words.", - example_word, - token_rankings - .get(example_word_token) - .map(|(count, _)| *count) - .unwrap_or(0), - token_rankings - .get(example_word_token) - .map(|(_, rank)| *rank) - .unwrap_or(0) - ); - - Ok(()) -} - -pub fn run_tokenizer( - input_path: &str, //path to csv provided by parser - //output_path: &str, - //language: &str, - _logger: &Logger, //not used currently but hopefully will later -) -> Result, (usize, usize)>> { - //No checks for language yet. Just uses java for now. Will add more languages later. let language = "java"; let minimum_loc = 5; //temporary //let separators = vec!["(", ")", "[", "]", "{", "}", ";", ".", ",", ":", "=", "+", "-", "*", "/", "%", "<", ">", "&", "|", "!", "?", "~", "^", "#", "$", "@", "\"", "\\", "`", "'"]; //hardcoded separators for now. Will add more later and make it configurable. @@ -138,7 +106,41 @@ pub fn run_tokenizer( n_functions_after_loc / n_functions_before_loc * 100 } ); - let global_bow = global_counter(&input_file)?; + + let token_rankings = run_tokenizer(&input_file, logger)?; + + let example_word = example_word.to_ascii_lowercase(); + let example_word_token = example_word.as_bytes(); + + info!( + "Global Bag of Words generated. Checking for example word '{}'", + example_word + ); + info!( + " The token '{}' appears {} times and is ranked {} in the global Bag of Words.", + example_word, + token_rankings + .get(example_word_token) + .map(|(count, _)| *count) + .unwrap_or(0), + token_rankings + .get(example_word_token) + .map(|(_, rank)| *rank) + .unwrap_or(0) + ); + + Ok(()) +} + +pub fn run_tokenizer( + input_file: &DataFrame, + //output_path: &str, + //language: &str, + _logger: &Logger, //not used currently but hopefully will later +) -> Result, (usize, usize)>> { + //No checks for language yet. Just uses java for now. Will add more languages later. 
+ + let global_bow = global_counter(input_file)?; let token_rankings: std::collections::HashMap, (usize, usize)> = global_bow.token_rankings(); From 5f3fd89ea31f2df1e801b5e8e5ceea2ee43f4ef8 Mon Sep 17 00:00:00 2001 From: swartling Date: Fri, 20 Mar 2026 16:18:17 +0100 Subject: [PATCH 06/14] inverted index and candidate map structures built --- src/bin/main.rs | 16 ++ src/phases/mod.rs | 2 +- src/phases/tokenizer.rs | 16 +- src/phases/type_3_duplicate_files.rs | 314 +++++++++++++++++++++++++++ src/utils/bow.rs | 5 + src/utils/candidate_map.rs | 77 +++++++ src/utils/inverted_index.rs | 54 +++++ src/utils/mod.rs | 2 + 8 files changed, 477 insertions(+), 9 deletions(-) create mode 100644 src/phases/type_3_duplicate_files.rs create mode 100644 src/utils/candidate_map.rs create mode 100644 src/utils/inverted_index.rs diff --git a/src/bin/main.rs b/src/bin/main.rs index cea9fb4..883911a 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -17,6 +17,7 @@ use clap::{Arg, ArgAction, Command}; use scyros::phases::{ download, duplicate_files, duplicate_ids, extract_benchmarks, filter_languages, filter_metadata, forks, ids, languages, metadata, parse, pull_request, tokenizer, + type_3_duplicate_files, }; use scyros::utils::logger::Logger; use tracing::{error, info}; @@ -38,6 +39,7 @@ fn cli() -> Command { .subcommand(parse::cli()) .subcommand(extract_benchmarks::cli()) .subcommand(tokenizer::cli()) + .subcommand(type_3_duplicate_files::cli()) .arg( Arg::new("debug") .long("debug") @@ -241,6 +243,20 @@ fn main() { &logger, ) } + else if subcommand == type_3_duplicate_files::cli().get_name() { + type_3_duplicate_files::run( + cli_subargs.get_one::("input").unwrap(), + cli_subargs.get_one::("output").map(|x| x.as_str()), + cli_subargs.get_one::("map").map(|x| x.as_str()), + cli_subargs.get_one::("logs").map(|x| x.as_str()), + /* languages */ + *cli_subargs.get_one::("threads").unwrap(), + *cli_subargs.get_one::("p_prefix").unwrap(), + *cli_subargs.get_one::("threshold").unwrap(), 
+ cli_subargs.get_one::("example_word"), + &logger, + ) + } else { Err(anyhow!("The subcommand {} is not available. Run the program with the --help flag to see the list of subcommands", subcommand)) } diff --git a/src/phases/mod.rs b/src/phases/mod.rs index ffc02b3..14c1323 100644 --- a/src/phases/mod.rs +++ b/src/phases/mod.rs @@ -26,4 +26,4 @@ pub mod metadata; pub mod parse; pub mod pull_request; pub mod tokenizer; -//pub mod type_3_duplicate_files; +pub mod type_3_duplicate_files; diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs index 21900c4..89f41ee 100644 --- a/src/phases/tokenizer.rs +++ b/src/phases/tokenizer.rs @@ -6,13 +6,13 @@ use crate::utils::bow::*; use crate::utils::fs::*; use crate::utils::regex::*; use crate::utils::{/* csv::*, */ logger::Logger}; -use anyhow::{anyhow, /*bail, Context, */ Result}; +use anyhow::{/*anyhow, bail, Context, */ Result}; use tracing::info; /* use clang::token; use polars::frame::row; */ use clap::{Arg, /* ArgAction, */ Command}; use polars::prelude::*; -use std::collections::HashMap; +//use std::collections::HashMap; /* struct Token { word: Vec, local_count: usize, @@ -42,7 +42,7 @@ pub fn cli() -> Command { ) } -pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> { +pub fn run(input_path: &str, example_word: &str, _logger: &Logger) -> Result<()> { let language = "java"; let minimum_loc = 5; //temporary //let separators = vec!["(", ")", "[", "]", "{", "}", ";", ".", ",", ":", "=", "+", "-", "*", "/", "%", "<", ">", "&", "|", "!", "?", "~", "^", "#", "$", "@", "\"", "\\", "`", "'"]; //hardcoded separators for now. Will add more later and make it configurable. 
@@ -107,8 +107,8 @@ pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> } ); - let token_rankings = run_tokenizer(&input_file, logger)?; - + //let token_rankings = run_tokenizer(&input_file, logger)?; + let token_rankings = global_counter(&input_file)?.token_rankings(); let example_word = example_word.to_ascii_lowercase(); let example_word_token = example_word.as_bytes(); @@ -132,7 +132,7 @@ pub fn run(input_path: &str, example_word: &str, logger: &Logger) -> Result<()> Ok(()) } -pub fn run_tokenizer( +/* pub fn run_tokenizer( input_file: &DataFrame, //output_path: &str, //language: &str, @@ -146,9 +146,9 @@ pub fn run_tokenizer( global_bow.token_rankings(); Some(token_rankings).ok_or_else(|| anyhow!("No tokens found in the global Bag of Words.")) -} +} */ -fn global_counter(input_file: &DataFrame) -> Result { +pub fn global_counter(input_file: &DataFrame) -> Result { let word_matcher: Matcher = Matcher::words_matcher(); let mut global_bow: Bow = Bow::new(); diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs new file mode 100644 index 0000000..0618d67 --- /dev/null +++ b/src/phases/type_3_duplicate_files.rs @@ -0,0 +1,314 @@ +use crate::phases::tokenizer::global_counter; +use crate::utils::fs::*; +use crate::utils::inverted_index::*; +use crate::utils::logger::Logger; +use crate::utils::regex::*; +use anyhow::{/* Error, */ Result}; +use blake3; +use clap::{Arg, Command}; +use core::f64; +use polars::prelude::*; +use std::cmp::Reverse; +use std::collections::HashMap; +use std::vec; +use tracing::info; + +pub fn cli() -> Command { + Command::new("type_3_duplicate_files") + .about("Detects type 3 clones by building an index based on the most common tokens in the functions and their frequencies.") + .disable_version_flag(true) + .arg( + Arg::new("input") + .short('i') + .long("input") + .help("Path to the input CSV file generated by the parser.") + .required(true), + ) + .arg( + Arg::new("output") + 
.short('o') + .long("output") + .help("Path to the output CSV file to store unique files metadata.") + .required(false), + ) + .arg( + Arg::new("map") + .short('m') + .long("map") + .help("Path to the map CSV file to store the mapping of clones to their originals.") + .required(false), + ) + .arg( + Arg::new("logs") + .short('l') + .long("logs") + .help("Path to the logs file to store error logs.") + .required(false), + ) + .arg( + Arg::new("languages") + .short('g') + .long("languages") + .help("Comma-separated list of languages to consider. If not provided, all languages will be considered.") + .required(false), + ) + .arg( + Arg::new("threads") + .short('n') + .help("Number of threads to use, default is 1.") + .default_value("1") + .value_parser(clap::value_parser!(usize)) + ) + .arg( + Arg::new("p_prefix") + .short('p') + .long("p_prefix") + .default_value("1") + .help("Number of tokens to consider for the prefix. Default is 1.") + .value_parser(clap::value_parser!(usize)) + ) + .arg( + Arg::new("threshold") + .short('t') + .long("threshold") + .help("Similarity threshold. Default is 0.8.") + .default_value("0.8") + .value_parser(clap::value_parser!(f64)) + ) + .arg( + Arg::new("example_word") + .short('e') + .long("example-word") + .help("An example word to check the global Bag of Words for.") + .required(false), + ) +} + +pub fn run( + input_path: &str, //The path to the input CSV file storing the file paths, output of the parser phase + _output_path: Option<&str>, //optional path to the output CSV file to store unique files metadata. + _map_path: Option<&str>, //optional path to the map CSV file to store the mapping of clones to their originals. + _logs_path: Option<&str>, //for error logs, not implemented yet + /* _opt_languages: Option>, //optional list of languages. 
Currently java is hardcoded */ + _threads: usize, //current implementation is single-threaded + p_prefix: usize, //number of tokens to consider for the prefix, default is 1 + threshold: f64, //threshold for the prefix length, default is 0.8 + example_word: Option<&String>, //an example word to check the global Bag of Words for, optional + _logger: &Logger, +) -> Result<()> { + let language = "java"; + let minimum_loc = 5; //temporary + let mut input_file = open_csv( + input_path, + Some(Schema::from_iter(vec![ + Field::new("id".into(), DataType::UInt32), + Field::new("path".into(), DataType::String), + Field::new("name".into(), DataType::String), + Field::new("position".into(), DataType::String), + Field::new("loc".into(), DataType::UInt32), + Field::new("words".into(), DataType::UInt32), + ])), + Some(vec![ + "id", "path", "name", "position", "language", "loc", "words", + ]), + )?; + + let n_functions_before_language = input_file.height(); + info!( + "{} functions found in the input file, filtering by selected language", + n_functions_before_language + ); + + //input_file = input_file.filter(&input_file.column("language")?.equal(language)); + input_file = input_file + .lazy() + .filter(col("language").eq(lit(language))) + .collect()?; + + let n_functions_after_language = input_file.height(); + info!( + " {} files found after filtering ({:.2} %)", + n_functions_after_language, + if n_functions_before_language == 0 { + 0 + } else { + (n_functions_after_language as f64 / n_functions_before_language as f64 * 100.0) + as usize + } + ); + let n_functions_before_loc = input_file.height(); + + info!(" {} functions found after filtering by language, filtering functions with less that {} lines of code.", n_functions_before_loc, minimum_loc); + + //input_file = input_file.filter(&input_file.column("loc")?.greater_equal(minimum_loc))?; + + input_file = input_file + .lazy() + .filter(col("loc").gt_eq(lit(minimum_loc))) + .collect()?; + + let n_functions_after_loc = 
input_file.height(); + + info!( + "{} functions found after filtering ({:.2} %)", //something is weird with the percentage calculation here. + n_functions_after_loc, + if n_functions_before_loc == 0 { + 0 + } else { + (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize + } + ); + let global_bow = global_counter(&input_file)?; + let token_rankings = global_bow.token_rankings(); + let vector_of_indices_plus_min_max = + index_builder(&input_file, token_rankings, p_prefix, threshold)?; + // Maximum and minimum 'words' in input file + let vector_of_indices = vector_of_indices_plus_min_max.0; + let min_words = vector_of_indices_plus_min_max.1 .0; + let max_words = vector_of_indices_plus_min_max.1 .1; + info!( + "Built {} indices with prefix scheme from 1 to {}, minimum words in a function: {}, maximum words in a function: {}.", + vector_of_indices.len(), + p_prefix, + min_words, + max_words + ); + + if let Some(word) = example_word { + let word = word.to_owned().as_bytes().to_ascii_lowercase(); + let mut index_number = 1; + for index in vector_of_indices.iter() { + info!( + "Index {} has {} entries, total length of vectors in entries: {}", + index_number, + index.len(), + index.len_tokens() + ); + if let Some(entries) = index.get(&word) { + info!( + "Entries for the example word '{}' in index {}ยง:", + String::from_utf8_lossy(&word), + index_number + ); + for (function_id, count) in entries { + info!("Function ID: {}, Count: {}", function_id, count); + } + } else { + info!( + "The example word '{}' was not found in index {}.", + String::from_utf8_lossy(&word), + index_number + ); + } + index_number += 1; + } + } + + //go through input file again? Means i can grab 'words' from the file. 
Could do something like just checking candidates + + Ok(()) +} + +fn index_builder( + input_file: &DataFrame, + token_rankings: HashMap, (usize, usize)>, + p_prefix: usize, + threshold: f64, +) -> Result<(Vec, (usize, usize))> { + let word_matcher: Matcher = Matcher::words_matcher(); + + let mut vector_of_indices: Vec = Vec::new(); + for _i in 1..=p_prefix { + vector_of_indices.push(InvertedIndex::new()); + //info!("Initialized index {}.", _i); + } + let mut min_words = usize::MAX; + let mut max_words = 0; + for path in input_file + .column("path") + .and_then(|c| c.str()) + .unwrap() + .into_iter() + .flatten() + { + match load_file(path, 1024 * 1024 * 1024) { + Ok(Ok(function_code)) => { + let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase()); + let mut vectored_bow = local_bow.vectorize(); + vectored_bow.sort_by_key(|(token, _)| { + Reverse( + token_rankings + .get(token) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX), + ) + }); + let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::(); + + // Min and Max codeblock are in number of words, not tokens but seeing as they're only used for estimating verification cost they don't need to be precise + if codeblock_length < min_words { + min_words = codeblock_length; + } + if codeblock_length > max_words { + max_words = codeblock_length; + } + let _verification_cost_per_candidate_estimate = + (min_words as f64 + max_words as f64) / 2.0; + //Temporarily shut off so the compiler doesn't complain about unused variables, will be used later + let prefix_length = + codeblock_length - ((codeblock_length as f64) * threshold).round() as usize + 1; + + let mut cumulative_count = 0; + let mut p = 1; + let function_id = blake3::hash(path.as_bytes()); + //info!("Prefix length: {}, total tokens: {}, codeblock length: {}", prefix_length, vectored_bow.len(), codeblock_length); + + for (token, count) in vectored_bow { + cumulative_count += count; + vector_of_indices[p - 1].add(&token, 
count, function_id); + if cumulative_count >= prefix_length { + if p == p_prefix { + //info!("Prefix scheme {} added token {} with count {}", p, String::from_utf8_lossy(&token), count); + break; + } else { + p += 1; + } + } + } + } + Ok(Err(_e)) => { + info!("Warning: File too large at path '{}', skipping.", path); + } + Err(_e) => { + info!("Failed to read file at path '{}', skipping.", path); + } + } + } + info!("Finished building indices."); + Ok((vector_of_indices, (min_words, max_words))) +} + +/* fn delta_filter_cost( + prefix_vector: &Vec<(Vec, usize)>, + vector_of_indices: &Vec, + p_prefix: usize, + &previous_cost: &usize, +) -> usize { + let mut cost = 0; + if p_prefix == 1 { + for (token, _) in prefix_vector { + cost += vector_of_indices[0].token_frequency(token, false); + } + } else { + let last_token = prefix_vector.last().unwrap().0.clone(); + for (token, _) in prefix_vector { + cost += vector_of_indices[p_prefix - 1].token_frequency(token, false); + } + for p in 1..(p_prefix - 1) { + // the previous for-loop already counted the last token for the current inverted_index + cost += vector_of_indices[p - 1].token_frequency(&last_token, false); + } + } + let total_cost = previous_cost + cost; + total_cost +} */ diff --git a/src/utils/bow.rs b/src/utils/bow.rs index 4c2b347..897b0a8 100644 --- a/src/utils/bow.rs +++ b/src/utils/bow.rs @@ -109,6 +109,11 @@ impl Bow { } rankings } + + pub fn vectorize(self) -> Vec<(Vec, usize)> { + let vector: Vec<(Vec, usize)> = self.map.into_iter().collect(); + vector + } } #[cfg(test)] diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs new file mode 100644 index 0000000..47f0a77 --- /dev/null +++ b/src/utils/candidate_map.rs @@ -0,0 +1,77 @@ +use std::collections::{HashMap, HashSet}; + +pub struct CandidateEntry { + pub matches: usize, + // Not sure if I want matches here as well or just in match_histogram. 
+ //The benefit of having it here is that I can easily find what bucket the candidate is in in the histogram. + //The downside is that I have to update it here O(log n) when I update it in the histogram. + pub length: usize, + pub last_token_seen_pos: usize, +} + +pub struct CandidateMap { + entries: HashMap, + match_histogram: HashMap>, +} + +impl Default for CandidateMap { + fn default() -> Self { + CandidateMap::new() + } +} + +impl CandidateMap { + pub fn new() -> Self { + Self { + entries: HashMap::new(), + match_histogram: HashMap::new(), + } + } + + pub fn add_candidate( + &mut self, + function_id: blake3::Hash, + length: usize, + new_matches: usize, + last_token_seen_pos: usize, + ) { + let entry = self.entries.entry(function_id).or_insert(CandidateEntry { + matches: 0, + length, + last_token_seen_pos, + }); + + // Update the match histogram + if entry.matches > 0 { + if let Some(bucket) = self.match_histogram.get_mut(&entry.matches) { + bucket.remove(&function_id); + } + } + + entry.matches += new_matches; + entry.length = length; + entry.last_token_seen_pos = last_token_seen_pos; + + self.match_histogram + .entry(entry.matches) + .or_default() + .insert(function_id); + } + + pub fn count_candidates_with_n_matches(&self, n: usize, mode: &str) -> usize { + if mode == "exact" { + self.match_histogram + .get(&n) + .map(|bucket| bucket.len()) + .unwrap_or(0) + } else if mode == "at_least" { + self.match_histogram + .iter() + .filter(|(&matches, _)| matches >= n) + .map(|(_, bucket)| bucket.len()) + .sum() + } else { + panic!("Invalid mode: {}", mode); + } + } +} diff --git a/src/utils/inverted_index.rs b/src/utils/inverted_index.rs new file mode 100644 index 0000000..a976fd3 --- /dev/null +++ b/src/utils/inverted_index.rs @@ -0,0 +1,54 @@ +use blake3::Hash; +use std::collections::HashMap; +pub struct InvertedIndex { + map: HashMap, Vec<(Hash, usize)>>, // Maps tokens to a list of function IDs where they appear as well as the frequency of the token in that 
function +} + +impl Default for InvertedIndex { + fn default() -> Self { + InvertedIndex::new() + } +} + +impl InvertedIndex { + pub fn new() -> Self { + InvertedIndex { + map: HashMap::default(), + } + } + + pub fn add(&mut self, token: &Vec, count: usize, function_id: Hash) { + self.map + .entry(token.to_owned()) + .or_default() + .push((function_id, count)); + } + + pub fn get(&self, token: &Vec) -> Option<&Vec<(Hash, usize)>> { + self.map.get(token) + } + + pub fn len(&self) -> usize { + self.map.len() + } + + pub fn len_tokens(&self) -> usize { + self.map.values().map(|v| v.len()).sum() + } + + pub fn is_empty(&self) -> bool { + self.map.is_empty() + } + + pub fn token_frequency(&self, token: &Vec, count_duplicates: bool) -> usize { + if let Some(functions) = self.get(token) { + if count_duplicates { + functions.iter().map(|(_, count)| *count).sum() + } else { + functions.len() + } + } else { + 0 + } + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 3d01ca7..84192f7 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -13,11 +13,13 @@ // limitations under the License. 
pub mod bow; +pub mod candidate_map; pub mod csv; pub mod dataframes; pub mod fs; pub mod github; pub mod github_api; +pub mod inverted_index; pub mod json; pub mod logger; pub mod regex; From ba0fc06971f457292bbec63efae2831622448aa7 Mon Sep 17 00:00:00 2001 From: swartling Date: Mon, 23 Mar 2026 14:25:30 +0100 Subject: [PATCH 07/14] renamed rust-toolchain back --- dummy_rust-toolchain.toml => rust-toolchain.toml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dummy_rust-toolchain.toml => rust-toolchain.toml (100%) diff --git a/dummy_rust-toolchain.toml b/rust-toolchain.toml similarity index 100% rename from dummy_rust-toolchain.toml rename to rust-toolchain.toml From 941aabaada59a61fa075d7309a7c9743a973bff9 Mon Sep 17 00:00:00 2001 From: swartling Date: Tue, 7 Apr 2026 15:56:20 +0200 Subject: [PATCH 08/14] added detect_clones and verify_clones with accompanying data structures --- .gitignore | 3 +- src/phases/type_3_duplicate_files.rs | 378 ++++++++++++++++++++++++--- src/utils/candidate_map.rs | 133 +++++++++- src/utils/inverted_index.rs | 26 +- 4 files changed, 483 insertions(+), 57 deletions(-) diff --git a/.gitignore b/.gitignore index 488e813..d4e6d87 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,5 @@ outputs/ src/phases/alt_parse.rs //Got broken by refactoring but is not in use currently, so ignoring for now *.zip *.tar.gz -result/ \ No newline at end of file +result/ +examples/ \ No newline at end of file diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs index 78ccd3d..4ea31b9 100644 --- a/src/phases/type_3_duplicate_files.rs +++ b/src/phases/type_3_duplicate_files.rs @@ -1,4 +1,5 @@ use crate::phases::tokenizer::global_counter; +use crate::utils::candidate_map::*; use crate::utils::fs::*; use crate::utils::inverted_index::*; use crate::utils::logger::Logger; @@ -8,8 +9,8 @@ use blake3; use clap::{Arg, Command}; use core::f64; use polars::prelude::*; -use std::cmp::Reverse; -use 
std::collections::HashMap; +use std::cmp::{max, min, Reverse}; +use std::collections::{HashMap, HashSet}; use std::vec; use tracing::info; @@ -158,10 +159,33 @@ pub fn run( (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize } ); + + //moved here from detect_clones + let paths_column = input_file.column("path")?.str()?; + let words_column = input_file.column("words")?.u32()?; + let rows: Vec<(&str, usize)> = paths_column + .into_iter() + .zip(words_column) + .filter_map(|(path_opt, words_opt)| match (path_opt, words_opt) { + (Some(path), Some(words)) => Some((path, words as usize)), + _ => None, + }) + .collect(); + + let function_paths_and_lengths: HashMap = rows + .iter() + .map(|(path, words)| (blake3::hash(path.as_bytes()), (*path, *words))) + .collect(); + let global_bow = global_counter(&input_file)?; let token_rankings = global_bow.token_rankings(); - let vector_of_indices_plus_min_max = - index_builder(&input_file, token_rankings, p_prefix, threshold)?; + let vector_of_indices_plus_min_max = index_builder( + &input_file, + &token_rankings, + p_prefix, + threshold, + &function_paths_and_lengths, + )?; // Maximum and minimum 'words' in input file let vector_of_indices = vector_of_indices_plus_min_max.0; let min_words = vector_of_indices_plus_min_max.1 .0; @@ -190,8 +214,11 @@ pub fn run( String::from_utf8_lossy(&word), index_number ); - for (function_id, count) in entries { - info!("Function ID: {}, Count: {}", function_id, count); + for (function_id, count, (token_position, cumulative_count)) in entries { + info!( + "Function ID: {}, Count: {}, Token Position: {}, Cumulative Count: {}", + function_id, count, token_position, cumulative_count + ); } } else { info!( @@ -205,15 +232,25 @@ pub fn run( } //go through input file again? Means i can grab 'words' from the file. 
Could do something like just checking candidates - + let clone_map = detect_clones( + &token_rankings, + &vector_of_indices, + threshold, + &function_paths_and_lengths, + )?; + info!( + "Finished detecting clones. {} unique files found.", + clone_map.len() + ); Ok(()) } fn index_builder( input_file: &DataFrame, - token_rankings: HashMap, (usize, usize)>, + token_rankings: &HashMap, (usize, usize)>, p_prefix: usize, threshold: f64, + function_paths_and_lengths: &HashMap, ) -> Result<(Vec, (usize, usize))> { let word_matcher: Matcher = Matcher::words_matcher(); @@ -243,8 +280,12 @@ fn index_builder( .unwrap_or(usize::MAX), ) }); - let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::(); - + //let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::(); + let codeblock_length = function_paths_and_lengths + .get(&blake3::hash(path.as_bytes())) + .map(|(_, count)| *count) + .unwrap_or(0); + // Could probably rewrite this to get number of words from the input file instead of calculating it here // Min and Max codeblock are in number of words, not tokens but seeing as they're only used for estimating verification cost they don't need to be precise if codeblock_length < min_words { min_words = codeblock_length; @@ -252,9 +293,6 @@ fn index_builder( if codeblock_length > max_words { max_words = codeblock_length; } - let _verification_cost_per_candidate_estimate = - (min_words as f64 + max_words as f64) / 2.0; - //Temporarily shut off so the compiler doesn't complain about unused variables, will be used later let prefix_length = codeblock_length - ((codeblock_length as f64) * threshold).round() as usize + 1; @@ -263,9 +301,9 @@ fn index_builder( let function_id = blake3::hash(path.as_bytes()); //info!("Prefix length: {}, total tokens: {}, codeblock length: {}", prefix_length, vectored_bow.len(), codeblock_length); - for (token, count) in vectored_bow { + for (idx, (token, count)) in vectored_bow.iter().enumerate() { cumulative_count += 
count; - vector_of_indices[p - 1].add(&token, count, function_id); + vector_of_indices[p - 1].add(token, function_id, *count, idx, cumulative_count); if cumulative_count >= prefix_length { if p == p_prefix { //info!("Prefix scheme {} added token {} with count {}", p, String::from_utf8_lossy(&token), count); @@ -288,37 +326,301 @@ fn index_builder( Ok((vector_of_indices, (min_words, max_words))) } -/* fn delta_filter_cost( - prefix_vector: &Vec<(Vec, usize)>, - vector_of_indices: &Vec, +fn delta_filter_cost( + token_tuple: &(Vec, usize), + vector_of_indices: &[InvertedIndex], //changed from &Vec since the compiler requested it p_prefix: usize, - &previous_cost: &usize, + new: bool, ) -> usize { + let token = &token_tuple.0; let mut cost = 0; - if p_prefix == 1 { - for (token, _) in prefix_vector { - cost += vector_of_indices[0].token_frequency(token, false); + if new { + //if the token is new to the prefix, we need to count its frequency in all previous delta indices + for p in 1..=p_prefix { + cost += vector_of_indices[p - 1].token_frequency(token, false); } } else { - let last_token = prefix_vector.last().unwrap().0.clone(); - for (token, _) in prefix_vector { - cost += vector_of_indices[p_prefix - 1].token_frequency(token, false); + //just count the frequency in the new delta index + cost += vector_of_indices[p_prefix - 1].token_frequency(token, false); + } + cost +} + +fn weighted_prefix_end(vectored_bow: &[(Vec, usize)], prefix_length: usize) -> usize { + if prefix_length == 0 { + info!("Prefix length is 0, returning 0 for weighted prefix end."); + // This case shouldn't be seen + return 0; + } + let mut cumulative_count = 0usize; + for (idx, (_, count)) in vectored_bow.iter().enumerate() { + cumulative_count += *count; + if cumulative_count >= prefix_length { + return idx + 1; //Enumerator is 0-based, so we need to add 1 to get the correct length of the prefix vector } - for p in 1..(p_prefix - 1) { - // the previous for-loop already counted the last token for 
the current inverted_index - cost += vector_of_indices[p - 1].token_frequency(&last_token, false); + } + info!("Warning: prefix_length {} is greater than total token count {}, returning full length of vectored_bow.", prefix_length, vectored_bow.len()); + vectored_bow.len() +} + +fn detect_clones( + token_rankings: &HashMap, (usize, usize)>, + vector_of_indices: &[InvertedIndex], //changed from &Vec since the compiler requested it + threshold: f64, + function_paths_and_lengths: &HashMap, +) -> Result>> { + // result will probably be a 'clone-map'. Unsure for now if it has to be its own data-structure or if i can reuse the candidate map from before. + let mut clone_map: HashMap> = HashMap::new(); //key is the original function id, value is a set of clones of that function + + let word_matcher: Matcher = Matcher::words_matcher(); + let p_prefix = vector_of_indices.len(); + for (path, origin_word_count) in function_paths_and_lengths.values() { + info!("Path: {}, Words: {}", path, origin_word_count); + match load_file(path, 1024 * 1024 * 1024) { + Ok(Ok(function_code)) => { + let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase()); + let mut origin_vectored_bow = local_bow.vectorize(); + origin_vectored_bow.sort_by_key(|(token, _)| { + Reverse( + token_rankings + .get(token) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX), + ) + }); + let origin_function_id = blake3::hash(path.as_bytes()); + let mut candidate_map = CandidateMap::new(); + + let prefix_length = origin_word_count + - ((*origin_word_count as f64) * threshold).round() as usize + + 1; + + let init_prefix_end = weighted_prefix_end(&origin_vectored_bow, prefix_length); + let mut filter_cost_vector: Vec = Vec::new(); + filter_cost_vector.push(0); //cost of prefix scheme 1 is calculated from an empty prefix, so the initial cost is 0 + let mut verification_cost_vector: Vec = Vec::new(); + verification_cost_vector.push(0); //verification cost is estimated as 0 for the first prefix scheme 
since we haven't seen any candidates yet, + let mut total_cost_vector: Vec = Vec::new(); + total_cost_vector.push(usize::MAX); //total cost is initially set to max since so 0-prefix can never be chosen as the best prefix scheme + // big loop, will be used for the different prefix schemes + let mut origin_cumulative_count = 0usize; + 'prefix_schemes: for p in 1..=p_prefix { + let mut filter_cost = filter_cost_vector[p - 1]; // start with the filter cost of the previous prefix scheme + let prefix_end = init_prefix_end + p - 1; //the prefix end for the current scheme is at least the prefix end of the first scheme + the number of tokens in the prefix - 1 (since p-prefix is at least 1) + + for (idx, token_tuple) in + origin_vectored_bow.iter().take(prefix_end).enumerate() + { + //loop through the prefix vector of the current scheme, for the first scheme this is just the original prefix vector, for the next schemes this includes additional tokens + let is_new = idx + 1 == prefix_end; + origin_cumulative_count += token_tuple.1; + filter_cost += delta_filter_cost(token_tuple, vector_of_indices, p, is_new); + for candidate in vector_of_indices[p - 1] + .get(&token_tuple.0) + .unwrap_or(&Vec::new()) + { + let candidate_word_count = function_paths_and_lengths + .get(&candidate.0) + .map(|(_, count)| *count) + .unwrap_or(0); + + if candidate_word_count + > ((*origin_word_count as f64) * threshold).round() as usize + { + let new_matches = min(token_tuple.1, candidate.1); + let function_id = candidate.0; + let last_token_seen_pos = candidate.2; // (token_position, cumulative_count) + let current_threshold = + (max(*origin_word_count, candidate_word_count) as f64 + * threshold) + .round() as usize; + let upper_bound = min( + *origin_word_count - origin_cumulative_count, + candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates + ); + if candidate_map.get_token_matches(&function_id) 
+ upper_bound + >= current_threshold + { + candidate_map.add_pending_update( + function_id, + new_matches, + last_token_seen_pos, + ); + } + } + } + filter_cost_vector.push(filter_cost); + verification_cost_vector.push(candidate_map.verification_cost_estimate(p)); + total_cost_vector.push(filter_cost + verification_cost_vector[p]); + + if total_cost_vector[p] > total_cost_vector[p - 1] { + info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]); + //return verify_candidates(candidate_map, path, function_code, p - 1); //Need to keep in mind if candidate map is already updated with new prefix scheme + verify_candidates( + origin_function_id, + &origin_vectored_bow, + (idx, origin_cumulative_count), + &mut candidate_map, + &mut clone_map, + p_prefix, + token_rankings, + threshold, + function_paths_and_lengths, + )?; + break 'prefix_schemes; + } else { + //apply updates + candidate_map.apply_pending_updates(function_paths_and_lengths); + if p == p_prefix { + //return verify_candidates(candidate_map, path, function_code, p); + verify_candidates( + origin_function_id, + &origin_vectored_bow, + (idx, origin_cumulative_count), + &mut candidate_map, + &mut clone_map, + p_prefix, + token_rankings, + threshold, + function_paths_and_lengths, + )?; + break 'prefix_schemes; + } + } + } + } + } + Ok(Err(_e)) => { + info!("Warning: File too large at path '{}', skipping.", path); + } + Err(_e) => { + info!("Failed to read file at path '{}', skipping.", path); + } } } - let total_cost = previous_cost + cost; - total_cost -} */ -/* fn clone_detection( - input_file: &DataFrame, - token_rankings: HashMap, (usize, usize)>, - vector_of_indices: &Vec, + Ok(clone_map) +} + +fn verify_candidates( + origin_function_id: blake3::Hash, + origin_vectored_bow: &Vec<(Vec, usize)>, + origin_last_token_seen_pos: (usize, 
usize), + candidate_map: &mut CandidateMap, + clone_map: &mut HashMap>, + p_prefix: usize, + token_rankings: &HashMap, (usize, usize)>, threshold: f64, -) -> Result<()> { // result will probably be a 'clone-map'. Unsure for now if it has to be its own data-structure or if i can reuse the candidate map from before. - // This is where the actual clone detection happens, currently not implemented + function_paths_and_lengths: &HashMap, +) -> Result<()> { + // This function will take the candidate map for a function and verify the candidates that have enough matches + // to be considered clones based on their full token vectors. + // The clone_map is updated with the results, mapping original function ids to sets of clone function ids. + let word_matcher: Matcher = Matcher::words_matcher(); + let origin_word_count = function_paths_and_lengths + .get(&origin_function_id) + .map(|(_, count)| *count) + .unwrap_or(0); + let origin_vectored_bow = origin_vectored_bow.to_owned(); + let origin_token_count = origin_vectored_bow.len(); + let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least"); + let mut origin_last_token_seen_pos = origin_last_token_seen_pos; // (token_position, cumulative_count) + for candidate_id in candidates_to_verify { + let (path, length) = function_paths_and_lengths + .get(&candidate_id) + .copied() + .unwrap(); + match load_file(path, 1024 * 1024 * 1024) { + Ok(Ok(candidate_code)) => { + // Handle successful file load + // load function, sort tokens by global frequency, calculate similarity, if above threshold add to clone map + let candidate_bow = word_matcher.bag_of_words(&candidate_code.to_ascii_lowercase()); + let mut vectored_candidate_bow = candidate_bow.vectorize(); + vectored_candidate_bow.sort_by_key(|(token, _)| { + Reverse( + token_rankings + .get(token) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX), + ) + }); + let candidate_word_count = length; + let candidate_token_count = 
vectored_candidate_bow.len(); + let current_threshold = (max(origin_word_count, candidate_word_count) as f64 + * threshold) + .round() as usize; + let mut candidate_last_token_seen_pos = + candidate_map.get_last_token_seen_pos(&candidate_id); // (token_position, cumulative_count) + let mut new_matches = 0usize; + while origin_last_token_seen_pos.0 < origin_token_count + && candidate_last_token_seen_pos.0 < candidate_token_count + { + if min( + origin_token_count - origin_last_token_seen_pos.1, + candidate_token_count - candidate_last_token_seen_pos.1, + ) > current_threshold + { + let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0]; + let candidate_token_tuple = + &vectored_candidate_bow[candidate_last_token_seen_pos.0]; + if origin_token_tuple.0 == candidate_token_tuple.0 { + //it's a match + new_matches += min(origin_token_tuple.1, candidate_token_tuple.1); + candidate_last_token_seen_pos.0 += 1; + candidate_last_token_seen_pos.1 += candidate_token_tuple.1; + origin_last_token_seen_pos.0 += 1; + origin_last_token_seen_pos.1 += origin_token_tuple.1; + } else if token_rankings + .get(&origin_token_tuple.0) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX) + < token_rankings + .get(&candidate_token_tuple.0) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX) + { + //origin token is more frequent than candidate token, so we move in the origin vector + origin_last_token_seen_pos.0 += 1; + origin_last_token_seen_pos.1 += origin_token_tuple.1; + } else { + //candidate token is more frequent than origin token, so we move in the candidate vector + candidate_last_token_seen_pos.0 += 1; + candidate_last_token_seen_pos.1 += candidate_token_tuple.1; + } + } else { + //the upper bound of the remaining matches is not enough to reach the threshold, so we can stop comparing this candidate + break; + } + } + candidate_map.add_candidate( + candidate_id, + function_paths_and_lengths, + new_matches, + candidate_last_token_seen_pos, + ); + if 
candidate_map.get_token_matches(&candidate_id) >= current_threshold { + //add to clone map + clone_map + .entry(origin_function_id) + .or_default() + .insert(candidate_id); + info!( + "Clone detected! Original: {}, Candidate: {}, Similarity: {:.2} %", + origin_function_id, + candidate_id, + (candidate_map.get_token_matches(&candidate_id) as f64 + / max(origin_word_count, candidate_word_count) as f64) + * 100.0 + ); + } + } + Ok(Err(_)) => { + info!("Warning: File too large at path '{}', skipping.", path); + } + Err(_) => { + info!("Failed to read file at path '{}', skipping.", path); + } + } + } Ok(()) -} */ +} diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs index 995ba30..32bb9eb 100644 --- a/src/utils/candidate_map.rs +++ b/src/utils/candidate_map.rs @@ -1,14 +1,18 @@ +use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; pub struct CandidateEntry { pub matches: usize, pub length: usize, - pub last_token_seen_pos: usize, + pub last_token_seen_pos: (usize, usize), // (token_position, cumulative_count) } pub struct CandidateMap { entries: HashMap, match_histogram: HashMap>, + pending_updates: Vec<(blake3::Hash, usize, (usize, usize))>, // (function_id, new_matches, last_token_seen_pos) + min_length: usize, + max_length: usize, } impl Default for CandidateMap { @@ -22,21 +26,68 @@ impl CandidateMap { Self { entries: HashMap::new(), match_histogram: HashMap::new(), + min_length: usize::MAX, + max_length: 0, + pending_updates: Vec::new(), + } + } + + pub fn get_token_matches(&self, function_id: &blake3::Hash) -> usize { + self.entries + .get(function_id) + .map(|entry| entry.matches) + .unwrap_or(0) + } + + pub fn add_pending_update( + &mut self, + function_id: blake3::Hash, + new_matches: usize, + last_token_seen_pos: (usize, usize), + ) { + self.pending_updates + .push((function_id, new_matches, last_token_seen_pos)); + } + + pub fn apply_pending_updates( + &mut self, + function_paths_and_lengths: &HashMap, + ) { + let 
updates = self.pending_updates.drain(..).collect::>(); + for (function_id, new_matches, last_token_seen_pos) in updates { + self.add_candidate( + function_id, + function_paths_and_lengths, + new_matches, + last_token_seen_pos, + ); } } pub fn add_candidate( &mut self, function_id: blake3::Hash, - length: usize, + function_paths_and_lengths: &std::collections::HashMap, new_matches: usize, - last_token_seen_pos: usize, + last_token_seen_pos: (usize, usize), ) { - let entry = self.entries.entry(function_id).or_insert(CandidateEntry { - matches: 0, - length, - last_token_seen_pos, - }); + let entry = match self.entries.entry(function_id) { + Entry::Occupied(occupied) => occupied.into_mut(), + Entry::Vacant(vacant) => { + let length = function_paths_and_lengths + .get(&function_id) + .map(|(_, count)| *count) + .unwrap_or(0); + let last_token_seen_pos = (0, 0); // Initialize to (0, 0) for new candidates + self.min_length = self.min_length.min(length); + self.max_length = self.max_length.max(length); + vacant.insert(CandidateEntry { + matches: 0, + length, + last_token_seen_pos, + }) + } + }; // Update the match histogram if entry.matches > 0 { @@ -46,15 +97,52 @@ impl CandidateMap { } entry.matches += new_matches; - entry.length = length; entry.last_token_seen_pos = last_token_seen_pos; - self.match_histogram .entry(entry.matches) .or_default() .insert(function_id); } + pub fn length_range(&self) -> Option<(usize, usize)> { + if self.entries.is_empty() { + None + } else { + Some((self.min_length, self.max_length)) + } + } + + pub fn get_candidates_with_n_matches(&self, n: usize, mode: &str) -> HashSet { + if mode == "exact" { + self.match_histogram.get(&n).cloned().unwrap_or_default() + } else if mode == "at_least" { + self.match_histogram + .iter() + .filter(|(&matches, _)| matches >= n) + .flat_map(|(_, bucket)| bucket.clone()) + .collect() + } else { + panic!("Invalid mode: {}", mode); + } + } + + pub fn get_last_token_seen_pos(&self, function_id: &blake3::Hash) -> 
(usize, usize) { + self.entries + .get(function_id) + .map(|entry| entry.last_token_seen_pos) + .unwrap_or((0, 0)) + } + + pub fn update_last_token_seen_pos( + &mut self, + function_id: &blake3::Hash, + new_pos: (usize, usize), + ) { + if let Some(entry) = self.entries.get_mut(function_id) { + entry.last_token_seen_pos = new_pos; + } + } + pub fn count_candidates_with_n_matches(&self, n: usize, mode: &str) -> usize { if mode == "exact" { self.match_histogram @@ -71,4 +159,29 @@ impl CandidateMap { panic!("Invalid mode: {}", mode); } } + + pub fn verification_cost_estimate(&self, n: usize) -> usize { + let mut number_of_candidates = self.count_candidates_with_n_matches(n, "at_least"); //the candidates that have already reached n matches + + let mut survivors = 0usize; + for candidate in &self.pending_updates { + let function_id = candidate.0; + let current_matches = self.get_token_matches(&function_id); + if current_matches == n - 1 { + survivors += 1; + } + } + number_of_candidates += survivors; //add the candidates that are about to reach n matches + // I am disregarding the candidates with less than n-1 matches that will also reach n_matches due to new_matches>1 + // But as I understand it they should always satisfy property 1 + // A candidate doesn't get to come back after being eliminated once + // Also it's a very rare edge case + let length_range = self.length_range().unwrap_or((usize::MAX, 0)); + let average_length = if length_range.0 == usize::MAX { + 0 + } else { + (length_range.0 + length_range.1) / 2 + }; + number_of_candidates * average_length + } } diff --git a/src/utils/inverted_index.rs b/src/utils/inverted_index.rs index a976fd3..5be6334 100644 --- a/src/utils/inverted_index.rs +++ b/src/utils/inverted_index.rs @@ -1,7 +1,7 @@ use blake3::Hash; use std::collections::HashMap; pub struct InvertedIndex { - map: HashMap, Vec<(Hash, usize)>>, // Maps tokens to a list of function IDs where they appear as well as the frequency of the token in that function 
+ map: HashMap, Vec<(Hash, usize, (usize, usize))>>, // token -> Vec<(function_id, count, (token_position, cumulative_count))> } impl Default for InvertedIndex { @@ -17,14 +17,24 @@ impl InvertedIndex { } } - pub fn add(&mut self, token: &Vec, count: usize, function_id: Hash) { - self.map - .entry(token.to_owned()) - .or_default() - .push((function_id, count)); + pub fn add( + &mut self, + token: &Vec, + function_id: Hash, + count: usize, + token_position: usize, + cumulative_count: usize, + ) { + //token_position is the index of the token. + // cumulative_count is the number of words seen up to and including this token including duplicates + self.map.entry(token.to_owned()).or_default().push(( + function_id, + count, + (token_position, cumulative_count), + )); } - pub fn get(&self, token: &Vec) -> Option<&Vec<(Hash, usize)>> { + pub fn get(&self, token: &Vec) -> Option<&Vec<(Hash, usize, (usize, usize))>> { self.map.get(token) } @@ -43,7 +53,7 @@ impl InvertedIndex { pub fn token_frequency(&self, token: &Vec, count_duplicates: bool) -> usize { if let Some(functions) = self.get(token) { if count_duplicates { - functions.iter().map(|(_, count)| *count).sum() + functions.iter().map(|(_, count, _)| *count).sum() } else { functions.len() } From 710fd6777eaded4347de49b5bf94a534c8ec59a2 Mon Sep 17 00:00:00 2001 From: swartling Date: Wed, 8 Apr 2026 13:48:05 +0200 Subject: [PATCH 09/14] fixed cost estimation logic --- src/phases/type_3_duplicate_files.rs | 74 +++++++++++++++++----------- src/utils/candidate_map.rs | 3 +- 2 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs index 4ea31b9..75f37cf 100644 --- a/src/phases/type_3_duplicate_files.rs +++ b/src/phases/type_3_duplicate_files.rs @@ -239,7 +239,7 @@ pub fn run( &function_paths_and_lengths, )?; info!( - "Finished detecting clones. {} unique files found.", + "Finished detecting clones. 
{} clones found.", clone_map.len() ); Ok(()) @@ -375,7 +375,8 @@ fn detect_clones( let word_matcher: Matcher = Matcher::words_matcher(); let p_prefix = vector_of_indices.len(); for (path, origin_word_count) in function_paths_and_lengths.values() { - info!("Path: {}, Words: {}", path, origin_word_count); + info!("-----------------------------------------------------------------------------"); + // info!("Path: {}, Words: {}", path, origin_word_count); match load_file(path, 1024 * 1024 * 1024) { Ok(Ok(function_code)) => { let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase()); @@ -404,15 +405,19 @@ fn detect_clones( total_cost_vector.push(usize::MAX); //total cost is initially set to max since so 0-prefix can never be chosen as the best prefix scheme // big loop, will be used for the different prefix schemes let mut origin_cumulative_count = 0usize; + let mut origin_token_position = 0usize; 'prefix_schemes: for p in 1..=p_prefix { let mut filter_cost = filter_cost_vector[p - 1]; // start with the filter cost of the previous prefix scheme let prefix_end = init_prefix_end + p - 1; //the prefix end for the current scheme is at least the prefix end of the first scheme + the number of tokens in the prefix - 1 (since p-prefix is at least 1) - for (idx, token_tuple) in - origin_vectored_bow.iter().take(prefix_end).enumerate() + /* for (idx, token_tuple) in + origin_vectored_bow.iter().take(prefix_end).enumerate() */ + while origin_token_position < prefix_end + && origin_token_position < origin_vectored_bow.len() { + let token_tuple = origin_vectored_bow.get(origin_token_position).unwrap(); //loop through the prefix vector of the current scheme, for the first scheme this is just the original prefix vector, for the next schemes this includes additional tokens - let is_new = idx + 1 == prefix_end; + let is_new = origin_token_position + 1 == prefix_end; origin_cumulative_count += token_tuple.1; filter_cost += delta_filter_cost(token_tuple, 
vector_of_indices, p, is_new); for candidate in vector_of_indices[p - 1] @@ -449,17 +454,41 @@ fn detect_clones( } } } - filter_cost_vector.push(filter_cost); - verification_cost_vector.push(candidate_map.verification_cost_estimate(p)); - total_cost_vector.push(filter_cost + verification_cost_vector[p]); + origin_token_position += 1; + } + if p == 1 { + candidate_map.apply_pending_updates(function_paths_and_lengths); + //apply updates for the first prefix scheme before estimating costs since it relies on min/max length + } + let verification_cost = candidate_map.verification_cost_estimate(p); + filter_cost_vector.push(filter_cost); + verification_cost_vector.push(verification_cost); + total_cost_vector.push(filter_cost + verification_cost); - if total_cost_vector[p] > total_cost_vector[p - 1] { - info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]); - //return verify_candidates(candidate_map, path, function_code, p - 1); //Need to keep in mind if candidate map is already updated with new prefix scheme + if total_cost_vector[p] > total_cost_vector[p - 1] { + info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. 
Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]); + info!("The next prefix scheme {} has estimated total cost of {}, filter cost: {}, verification cost: {}.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]); + verify_candidates( + origin_function_id, + &origin_vectored_bow, + (origin_token_position, origin_cumulative_count), + &mut candidate_map, + &mut clone_map, + p_prefix, + token_rankings, + threshold, + function_paths_and_lengths, + )?; + break 'prefix_schemes; + } else { + //apply updates + candidate_map.apply_pending_updates(function_paths_and_lengths); + if p == p_prefix { + //return verify_candidates(candidate_map, path, function_code, p); verify_candidates( origin_function_id, &origin_vectored_bow, - (idx, origin_cumulative_count), + (origin_token_position, origin_cumulative_count), &mut candidate_map, &mut clone_map, p_prefix, @@ -468,27 +497,12 @@ fn detect_clones( function_paths_and_lengths, )?; break 'prefix_schemes; - } else { - //apply updates - candidate_map.apply_pending_updates(function_paths_and_lengths); - if p == p_prefix { - //return verify_candidates(candidate_map, path, function_code, p); - verify_candidates( - origin_function_id, - &origin_vectored_bow, - (idx, origin_cumulative_count), - &mut candidate_map, - &mut clone_map, - p_prefix, - token_rankings, - threshold, - function_paths_and_lengths, - )?; - break 'prefix_schemes; - } } } } + info!("Filter cost vector: {:?}", filter_cost_vector); + info!("Verification cost vector: {:?}", verification_cost_vector); + info!("Total cost vector: {:?}", total_cost_vector); } Ok(Err(_e)) => { info!("Warning: File too large at path '{}', skipping.", path); diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs index 32bb9eb..52bee53 100644 --- a/src/utils/candidate_map.rs +++ b/src/utils/candidate_map.rs @@ -167,7 +167,8 @@ impl CandidateMap { for candidate in 
&self.pending_updates { let function_id = candidate.0; let current_matches = self.get_token_matches(&function_id); - if current_matches == n - 1 { + if n > 1 && current_matches == n - 1 { + // if n==1 the pending list is empty as they have already been applied survivors += 1; } } From 72fe49812acae2fdb45dbee3abc9e5c990b5e139 Mon Sep 17 00:00:00 2001 From: swartling Date: Sun, 12 Apr 2026 13:59:33 +0200 Subject: [PATCH 10/14] temp for migrating --- src/bin/main.rs | 1 + src/phases/parse.rs | 2 + src/phases/type_3_duplicate_files.rs | 75 +++++++++++++++---- .../type_3_duplicate_files/files/original.py | 6 ++ .../files/original.py.functions/1-1 | 6 ++ .../type_3_duplicate_files/files/type_1.py | 11 +++ .../files/type_1.py.functions/1-1 | 11 +++ .../type_3_duplicate_files/files/type_2.py | 12 +++ .../files/type_2.py.functions/1-1 | 12 +++ .../type_3_duplicate_files/files/type_3.py | 11 +++ .../files/type_3.py.functions/1-1 | 11 +++ .../type_3_duplicate_files/parser_log.csv | 5 ++ .../phases/type_3_duplicate_files/python.json | 12 +++ .../test_parser_input.csv | 5 ++ .../test_parser_output.functions.csv | 5 ++ 15 files changed, 171 insertions(+), 14 deletions(-) create mode 100644 tests/data/phases/type_3_duplicate_files/files/original.py create mode 100644 tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_1.py create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_2.py create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1 create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_3.py create mode 100644 tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1 create mode 100644 tests/data/phases/type_3_duplicate_files/parser_log.csv create mode 100644 tests/data/phases/type_3_duplicate_files/python.json create 
mode 100644 tests/data/phases/type_3_duplicate_files/test_parser_input.csv create mode 100644 tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv diff --git a/src/bin/main.rs b/src/bin/main.rs index 6d5a78c..715e788 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -267,6 +267,7 @@ fn main() { cli_subargs.get_one::("map").map(|x| x.as_str()), cli_subargs.get_one::("logs").map(|x| x.as_str()), /* languages */ + cli_subargs.get_one::("language").map(|s| s.as_str()), *cli_subargs.get_one::("threads").unwrap(), *cli_subargs.get_one::("p_prefix").unwrap(), *cli_subargs.get_one::("threshold").unwrap(), diff --git a/src/phases/parse.rs b/src/phases/parse.rs index 8ed9661..cabaf25 100644 --- a/src/phases/parse.rs +++ b/src/phases/parse.rs @@ -455,6 +455,7 @@ fn analyze_file( ignore_comments: bool, word_counter: &Matcher, ) -> Result<(String, Option)> { + info!("analyze_file called with path: {path}"); let grammar = language_to_grammar(language) .with_context(|| format!("Unsupported language: {language}"))?; // Initializes the parser @@ -462,6 +463,7 @@ fn analyze_file( parser.set_language(&grammar.lang)?; match load_file(path, 1024 * 1024 * 1024)? { Ok(source_code) => { + info!("File {path} loaded successfully"); // Creates a folder to store the functions of the file let target_folder: String = format!("{path}.functions"); create_dir(&target_folder)?; diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs index 75f37cf..8a95635 100644 --- a/src/phases/type_3_duplicate_files.rs +++ b/src/phases/type_3_duplicate_files.rs @@ -47,10 +47,11 @@ pub fn cli() -> Command { .required(false), ) .arg( - Arg::new("languages") + Arg::new("language") .short('g') .long("languages") - .help("Comma-separated list of languages to consider. If not provided, all languages will be considered.") + /* .help("Comma-separated list of languages to consider. 
If not provided, all languages will be considered.") */ + .help("language as a string, e.g. 'java'. If not provided, defaults to 'java'. TODO") .required(false), ) .arg( @@ -91,13 +92,15 @@ pub fn run( _map_path: Option<&str>, //optional path to the map CSV file to store the mapping of clones to their originals. _logs_path: Option<&str>, //for error logs, not implemented yet /* _opt_languages: Option>, //optional list of languages. Currently java is hardcoded */ + opt_language: Option<&str>, _threads: usize, //current implementation is single-threaded p_prefix: usize, //number of tokens to consider for the prefix, default is 1 threshold: f64, //threshold for the prefix length, default is 0.8 example_word: Option<&String>, //an example word to check the global Bag of Words for, optional _logger: &Logger, ) -> Result<()> { - let language = "java"; + //let language = "java"; + let language = opt_language.unwrap_or("java"); //default to java currently let minimum_loc = 5; //temporary let mut input_file = open_csv( input_path, @@ -531,10 +534,14 @@ fn verify_candidates( // to be considered clones based on their full token vectors. // The clone_map is updated with the results, mapping original function ids to sets of clone function ids. 
let word_matcher: Matcher = Matcher::words_matcher(); - let origin_word_count = function_paths_and_lengths + let (origin_path, origin_word_count) = function_paths_and_lengths .get(&origin_function_id) - .map(|(_, count)| *count) - .unwrap_or(0); + .copied() + .unwrap_or(("Unknown", 0)); + info!( + "Verifying candidates for function at path '{}', with word count {}.", + origin_path, origin_word_count + ); let origin_vectored_bow = origin_vectored_bow.to_owned(); let origin_token_count = origin_vectored_bow.len(); let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least"); @@ -544,6 +551,9 @@ fn verify_candidates( .get(&candidate_id) .copied() .unwrap(); + if candidate_id == origin_function_id { + continue; //skip comparing the function to itself + } match load_file(path, 1024 * 1024 * 1024) { Ok(Ok(candidate_code)) => { // Handle successful file load @@ -563,22 +573,31 @@ fn verify_candidates( let current_threshold = (max(origin_word_count, candidate_word_count) as f64 * threshold) .round() as usize; + info!("Current threshold: {}", current_threshold); let mut candidate_last_token_seen_pos = candidate_map.get_last_token_seen_pos(&candidate_id); // (token_position, cumulative_count) let mut new_matches = 0usize; while origin_last_token_seen_pos.0 < origin_token_count && candidate_last_token_seen_pos.0 < candidate_token_count { - if min( - origin_token_count - origin_last_token_seen_pos.1, - candidate_token_count - candidate_last_token_seen_pos.1, - ) > current_threshold - { + let upper_bound = min( + origin_word_count - origin_last_token_seen_pos.1, + candidate_word_count - candidate_last_token_seen_pos.1, + ) + candidate_map.get_token_matches(&candidate_id); + + if upper_bound > current_threshold { + info!("IF MIN"); let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0]; let candidate_token_tuple = &vectored_candidate_bow[candidate_last_token_seen_pos.0]; if origin_token_tuple.0 == candidate_token_tuple.0 { 
//it's a match + info!("MATCHING!"); + info!( + "MATCH! origin: {}, candidate: {}", + String::from_utf8_lossy(&origin_token_tuple.0), + String::from_utf8_lossy(&candidate_token_tuple.0) + ); new_matches += min(origin_token_tuple.1, candidate_token_tuple.1); candidate_last_token_seen_pos.0 += 1; candidate_last_token_seen_pos.1 += candidate_token_tuple.1; @@ -594,15 +613,41 @@ fn verify_candidates( .unwrap_or(usize::MAX) { //origin token is more frequent than candidate token, so we move in the origin vector + info!( + "origin_count > candidate_count: origin: {}, candidate: {}", + String::from_utf8_lossy(&origin_token_tuple.0), + String::from_utf8_lossy(&candidate_token_tuple.0) + ); origin_last_token_seen_pos.0 += 1; origin_last_token_seen_pos.1 += origin_token_tuple.1; } else { //candidate token is more frequent than origin token, so we move in the candidate vector + info!( + "candidate_count > origin_count: origin: {}, candidate: {}", + String::from_utf8_lossy(&origin_token_tuple.0), + String::from_utf8_lossy(&candidate_token_tuple.0) + ); candidate_last_token_seen_pos.0 += 1; candidate_last_token_seen_pos.1 += candidate_token_tuple.1; } } else { //the upper bound of the remaining matches is not enough to reach the threshold, so we can stop comparing this candidate + /* info!("UPPER BOUND of remaining matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", min(origin_word_count - origin_last_token_seen_pos.1, candidate_word_count - candidate_last_token_seen_pos.1), current_threshold); + info!( + "Current matches: {}, new matches: {}, total possible matches: {}", + candidate_map.get_token_matches(&candidate_id), + new_matches, + candidate_map.get_token_matches(&candidate_id) + + min( + origin_word_count - origin_last_token_seen_pos.1, + candidate_word_count - candidate_last_token_seen_pos.1 + ) + ); */ + info!("UPPER BOUND of remaining matches is {}, which is not enough to reach the threshold of {}. 
Stopping comparison for this candidate.", upper_bound, current_threshold); + info!( + "origin_last_token_seen_pos: {}, candidate_last_token_seen_pos: {}", + origin_last_token_seen_pos.0, candidate_last_token_seen_pos.0 + ); break; } } @@ -619,9 +664,11 @@ fn verify_candidates( .or_default() .insert(candidate_id); info!( - "Clone detected! Original: {}, Candidate: {}, Similarity: {:.2} %", - origin_function_id, - candidate_id, + "Clone detected! Candidate: {}, Similarity: {:.2} %", + function_paths_and_lengths + .get(&candidate_id) + .map(|(path, _)| *path) + .unwrap_or("Unknown"), (candidate_map.get_token_matches(&candidate_id) as f64 / max(origin_word_count, candidate_word_count) as f64) * 100.0 diff --git a/tests/data/phases/type_3_duplicate_files/files/original.py b/tests/data/phases/type_3_duplicate_files/files/original.py new file mode 100644 index 0000000..cec103e --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/original.py @@ -0,0 +1,6 @@ +def code_block_1(): + example_array = ["telephone", "dog", "example", "banana", "apple"] + # This is the original code block the others are compared to. 
+ for word in example_array: + print(f"Word: {word}") + print("Done") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1 new file mode 100644 index 0000000..6dbf785 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1 @@ -0,0 +1,6 @@ +def code_block_1(): + example_array = ["telephone", "dog", "example", "banana", "apple"] + + for word in example_array: + print(f"Word: {word}") + print("Done") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/type_1.py b/tests/data/phases/type_3_duplicate_files/files/type_1.py new file mode 100644 index 0000000..ba84af1 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/type_1.py @@ -0,0 +1,11 @@ +def code_block_1(): + # Type-1 Clone + # This code block is identical to the original code block with some exceptions + # The comment is different and it is formatted differently. 
+ example_array = ["telephone", "dog", "example", "banana", "apple"] + + for word in example_array: + + print(f"Word: {word}") + + print("Done") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1 new file mode 100644 index 0000000..8ba21b7 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1 @@ -0,0 +1,11 @@ +def code_block_1(): + + + + example_array = ["telephone", "dog", "example", "banana", "apple"] + + for word in example_array: + + print(f"Word: {word}") + + print("Done") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/type_2.py b/tests/data/phases/type_3_duplicate_files/files/type_2.py new file mode 100644 index 0000000..91eda16 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/type_2.py @@ -0,0 +1,12 @@ +def code_block_2(): + # Type-2 Clone + # This code block is identical to the original code block with some exceptions + # In addition to the type-1 changes, the variable name are changed and the function name is changed. + # One of the literal values has also changed. "telephone" -> "computer". 
+ my_array = ["computer", "dog", "example", "banana", "apple"] + + for item in my_array: + + print(f"Word: {item}") + + print("Done") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1 new file mode 100644 index 0000000..03556b6 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1 @@ -0,0 +1,12 @@ +def code_block_2(): + + + + + my_array = ["computer", "dog", "example", "banana", "apple"] + + for item in my_array: + + print(f"Word: {item}") + + print("Done") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/type_3.py b/tests/data/phases/type_3_duplicate_files/files/type_3.py new file mode 100644 index 0000000..74098d4 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/type_3.py @@ -0,0 +1,11 @@ +def code_block_3(): + # Type-3 Clone + # This code block is similar to the original code block but with some differences. + # In addition to the type-2 changes, it differs at the statement level. + # An 'append' statement has been added and the print statement at the end has been removed. 
+ my_array = ["computer", "dog", "example", "banana", "apple"] + my_array.append("grape") + + for item in my_array: + + print(f"Word: {item}") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1 b/tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1 new file mode 100644 index 0000000..c264039 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1 @@ -0,0 +1,11 @@ +def code_block_3(): + + + + + my_array = ["computer", "dog", "example", "banana", "apple"] + my_array.append("grape") + + for item in my_array: + + print(f"Word: {item}") \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/parser_log.csv b/tests/data/phases/type_3_duplicate_files/parser_log.csv new file mode 100644 index 0000000..872d26f --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/parser_log.csv @@ -0,0 +1,5 @@ +id,name,language,functions,functions_with_kw,tests/data/phases/type_3_duplicate_files/python.json,parse_error +1,tests/data/phases/type_3_duplicate_files/files/type_1.py,python,1,1,1,none +2,tests/data/phases/type_3_duplicate_files/files/type_2.py,python,1,1,1,none +0,tests/data/phases/type_3_duplicate_files/files/original.py,python,1,1,1,none +3,tests/data/phases/type_3_duplicate_files/files/type_3.py,python,1,1,1,none diff --git a/tests/data/phases/type_3_duplicate_files/python.json b/tests/data/phases/type_3_duplicate_files/python.json new file mode 100644 index 0000000..2be199d --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/python.json @@ -0,0 +1,12 @@ +{ + "languages": [ + { + "name": "python", + "extensions" : [ + "py" + ], + "keywords" : [] + } + ], + "keywords": [] +} \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/test_parser_input.csv b/tests/data/phases/type_3_duplicate_files/test_parser_input.csv new file mode 100644 index 0000000..0c10ab7 --- /dev/null +++ 
b/tests/data/phases/type_3_duplicate_files/test_parser_input.csv @@ -0,0 +1,5 @@ +id,name,language +0,tests/data/phases/type_3_duplicate_files/files/original.py,python +1,tests/data/phases/type_3_duplicate_files/files/type_1.py,python +2,tests/data/phases/type_3_duplicate_files/files/type_2.py,python +3,tests/data/phases/type_3_duplicate_files/files/type_3.py,python \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv b/tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv new file mode 100644 index 0000000..6780f28 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/test_parser_output.functions.csv @@ -0,0 +1,5 @@ +id,path,name,position,language,loc,words,tests/data/phases/type_3_duplicate_files/python.json,loop_statements,loop_nestings,if_statements,if_nestings,functions_calls,function_calls_nestings,params,param_kw_match,parse_error +1,tests/data/phases/type_3_duplicate_files/files/type_1.py.functions/1-1,code_block_1,1:1,python,11,18,9,1,1,0,0,2,1,0,0,none +2,tests/data/phases/type_3_duplicate_files/files/type_2.py.functions/1-1,code_block_2,1:1,python,12,18,9,1,1,0,0,2,1,0,0,none +0,tests/data/phases/type_3_duplicate_files/files/original.py.functions/1-1,code_block_1,1:1,python,6,18,9,1,1,0,0,2,1,0,0,none +3,tests/data/phases/type_3_duplicate_files/files/type_3.py.functions/1-1,code_block_3,1:1,python,11,19,10,1,1,0,0,2,1,0,0,none From a576665b67766d4f308f97727389f8c364de25a4 Mon Sep 17 00:00:00 2001 From: swartling Date: Thu, 23 Apr 2026 13:59:58 +0200 Subject: [PATCH 11/14] switched order of token ranking --- Cargo.lock | 1 + Cargo.toml | 1 + src/phases/tokenizer.rs | 2 +- src/phases/type_3_duplicate_files.rs | 259 ++++++++++++++++----------- src/utils/bow.rs | 4 +- src/utils/candidate_map.rs | 4 +- 6 files changed, 163 insertions(+), 108 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fd1dadb..2dff1a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2807,6 +2807,7 
@@ dependencies = [ "crossbeam-channel 0.5.15", "csv", "curl", + "either", "indicatif", "json", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index c7c90f9..1106684 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ crossbeam="0.7" crossbeam-channel="0.5.0" csv="1.1" curl="0.4" +either = "1.15.0" indicatif = "0.17.9" json="0.12" lazy_static = "1.4.0" diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs index 89f41ee..684adda 100644 --- a/src/phases/tokenizer.rs +++ b/src/phases/tokenizer.rs @@ -117,7 +117,7 @@ pub fn run(input_path: &str, example_word: &str, _logger: &Logger) -> Result<()> example_word ); info!( - " The token '{}' appears {} times and is ranked {} in the global Bag of Words.", + " The token '{}' appears {} times and is ranked {} in the global Bag of Words (rank 1 = least common token).", example_word, token_rankings .get(example_word_token) diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs index 8a95635..1a44e87 100644 --- a/src/phases/type_3_duplicate_files.rs +++ b/src/phases/type_3_duplicate_files.rs @@ -8,12 +8,15 @@ use anyhow::{/* Error, */ Result}; use blake3; use clap::{Arg, Command}; use core::f64; +use either::Either; use polars::prelude::*; -use std::cmp::{max, min, Reverse}; +use std::cmp::{max, min}; use std::collections::{HashMap, HashSet}; use std::vec; use tracing::info; +type CloneMap = HashMap, blake3::Hash>>; + pub fn cli() -> Command { Command::new("type_3_duplicate_files") .about("Detects type 3 clones by building an index based on the most common tokens in the functions and their frequencies.") @@ -101,7 +104,7 @@ pub fn run( ) -> Result<()> { //let language = "java"; let language = opt_language.unwrap_or("java"); //default to java currently - let minimum_loc = 5; //temporary + let minimum_loc = 2; //temporary let mut input_file = open_csv( input_path, Some(Schema::from_iter(vec![ @@ -276,12 +279,10 @@ fn index_builder( let local_bow = 
word_matcher.bag_of_words(&function_code.to_ascii_lowercase()); let mut vectored_bow = local_bow.vectorize(); vectored_bow.sort_by_key(|(token, _)| { - Reverse( - token_rankings - .get(token) - .map(|(_, rank)| *rank) - .unwrap_or(usize::MAX), - ) + token_rankings + .get(token) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX) }); //let codeblock_length = vectored_bow.iter().map(|(_, count)| count).sum::(); let codeblock_length = function_paths_and_lengths @@ -371,9 +372,9 @@ fn detect_clones( vector_of_indices: &[InvertedIndex], //changed from &Vec since the compiler requested it threshold: f64, function_paths_and_lengths: &HashMap, -) -> Result>> { +) -> Result { // result will probably be a 'clone-map'. Unsure for now if it has to be its own data-structure or if i can reuse the candidate map from before. - let mut clone_map: HashMap> = HashMap::new(); //key is the original function id, value is a set of clones of that function + let mut clone_map: CloneMap = HashMap::new(); let word_matcher: Matcher = Matcher::words_matcher(); let p_prefix = vector_of_indices.len(); @@ -385,14 +386,13 @@ fn detect_clones( let local_bow = word_matcher.bag_of_words(&function_code.to_ascii_lowercase()); let mut origin_vectored_bow = local_bow.vectorize(); origin_vectored_bow.sort_by_key(|(token, _)| { - Reverse( - token_rankings - .get(token) - .map(|(_, rank)| *rank) - .unwrap_or(usize::MAX), - ) + token_rankings + .get(token) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX) }); let origin_function_id = blake3::hash(path.as_bytes()); + info!("Origin path: {}", path); let mut candidate_map = CandidateMap::new(); let prefix_length = origin_word_count @@ -427,34 +427,43 @@ fn detect_clones( .get(&token_tuple.0) .unwrap_or(&Vec::new()) { + /* if candidate_id_lt_origin_id(&candidate.0, &origin_function_id) { + info!("DClone: SKIPPING candidate at path '{}' since it has a lower function ID than the origin.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| 
*path).unwrap_or("Unknown")); + continue; //skip candidates that have already been processed as origins + } */ + if clone_map.contains_key(&candidate.0) { + info!("DClone: SKIPPING candidate at path '{}' since it already has an entry in clone_map.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown")); + continue; + } let candidate_word_count = function_paths_and_lengths .get(&candidate.0) .map(|(_, count)| *count) .unwrap_or(0); if candidate_word_count - > ((*origin_word_count as f64) * threshold).round() as usize + < ((*origin_word_count as f64) * threshold).round() as usize + { + continue; //skip candidates that are too small to reach the threshold + } + + let new_matches = min(token_tuple.1, candidate.1); + let function_id = candidate.0; + let last_token_seen_pos = candidate.2; // (token_position, cumulative_count) + let current_threshold = + (max(*origin_word_count, candidate_word_count) as f64 * threshold) + .round() as usize; + let upper_bound = min( + *origin_word_count - origin_cumulative_count, + candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates + ); + if candidate_map.get_token_matches(&function_id) + upper_bound + >= current_threshold { - let new_matches = min(token_tuple.1, candidate.1); - let function_id = candidate.0; - let last_token_seen_pos = candidate.2; // (token_position, cumulative_count) - let current_threshold = - (max(*origin_word_count, candidate_word_count) as f64 - * threshold) - .round() as usize; - let upper_bound = min( - *origin_word_count - origin_cumulative_count, - candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates + candidate_map.add_pending_update( + function_id, + new_matches, + last_token_seen_pos, ); - if candidate_map.get_token_matches(&function_id) + upper_bound - >= 
current_threshold - { - candidate_map.add_pending_update( - function_id, - new_matches, - last_token_seen_pos, - ); - } } } origin_token_position += 1; @@ -463,7 +472,8 @@ fn detect_clones( candidate_map.apply_pending_updates(function_paths_and_lengths); //apply updates for the first prefix scheme before estimating costs since it relies on min/max length } - let verification_cost = candidate_map.verification_cost_estimate(p); + let verification_cost = + candidate_map.verification_cost_estimate(p, origin_word_count); filter_cost_vector.push(filter_cost); verification_cost_vector.push(verification_cost); total_cost_vector.push(filter_cost + verification_cost); @@ -522,9 +532,9 @@ fn detect_clones( fn verify_candidates( origin_function_id: blake3::Hash, origin_vectored_bow: &Vec<(Vec, usize)>, - origin_last_token_seen_pos: (usize, usize), + prefix_origin_last_token_seen_pos: (usize, usize), candidate_map: &mut CandidateMap, - clone_map: &mut HashMap>, + clone_map: &mut CloneMap, p_prefix: usize, token_rankings: &HashMap, (usize, usize)>, threshold: f64, @@ -544,106 +554,127 @@ fn verify_candidates( ); let origin_vectored_bow = origin_vectored_bow.to_owned(); let origin_token_count = origin_vectored_bow.len(); - let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least"); - let mut origin_last_token_seen_pos = origin_last_token_seen_pos; // (token_position, cumulative_count) + let candidates_to_verify = candidate_map.get_candidates_with_n_matches(p_prefix, "at_least"); // (token_position, cumulative_count) + let origin_vector_readable: Vec<(String, usize)> = origin_vectored_bow + .iter() + .map(|(token, count)| (String::from_utf8_lossy(token).to_string(), *count)) + .collect(); + info!("sorted origin vector: {:?}", origin_vector_readable); + info!("origin_function_id: {:?}", origin_function_id); + info!("origin_id as bytes: {:?}", origin_function_id.as_bytes()); for candidate_id in candidates_to_verify { + 
info!("----------------------"); let (path, length) = function_paths_and_lengths .get(&candidate_id) .copied() .unwrap(); + info!("candidate_id as bytes: {:?}", candidate_id.as_bytes()); + if clone_map.contains_key(&candidate_id) { + info!( + "SKIPPING candidate at path '{}' since it already has an entry in clone_map.", + path + ); + continue; + } if candidate_id == origin_function_id { + info!("Skipping self-comparison for function at path '{}'.", path); continue; //skip comparing the function to itself } + let mut origin_last_token_seen_pos = prefix_origin_last_token_seen_pos; match load_file(path, 1024 * 1024 * 1024) { Ok(Ok(candidate_code)) => { // Handle successful file load // load function, sort tokens by global frequency, calculate similarity, if above threshold add to clone map + info!("Candidate loaded: {}, length: {}", path, length); let candidate_bow = word_matcher.bag_of_words(&candidate_code.to_ascii_lowercase()); let mut vectored_candidate_bow = candidate_bow.vectorize(); vectored_candidate_bow.sort_by_key(|(token, _)| { - Reverse( - token_rankings - .get(token) - .map(|(_, rank)| *rank) - .unwrap_or(usize::MAX), - ) + token_rankings + .get(token) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX) }); + let candidate_vector_readable: Vec<(String, usize)> = vectored_candidate_bow + .iter() + .map(|(token, count)| (String::from_utf8_lossy(token).to_string(), *count)) + .collect(); + info!("sorted candidate vector: {:?}", candidate_vector_readable); let candidate_word_count = length; let candidate_token_count = vectored_candidate_bow.len(); let current_threshold = (max(origin_word_count, candidate_word_count) as f64 * threshold) .round() as usize; - info!("Current threshold: {}", current_threshold); let mut candidate_last_token_seen_pos = candidate_map.get_last_token_seen_pos(&candidate_id); // (token_position, cumulative_count) let mut new_matches = 0usize; + let prefix_matches = candidate_map.get_token_matches(&candidate_id); while 
origin_last_token_seen_pos.0 < origin_token_count && candidate_last_token_seen_pos.0 < candidate_token_count { let upper_bound = min( origin_word_count - origin_last_token_seen_pos.1, candidate_word_count - candidate_last_token_seen_pos.1, - ) + candidate_map.get_token_matches(&candidate_id); + ); + let current_matches = prefix_matches + new_matches; + let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0]; + let candidate_token_tuple = + &vectored_candidate_bow[candidate_last_token_seen_pos.0]; + + info!("Current threshold: {}", current_threshold); + info!( + "Current matches: {} + {} = {}", + prefix_matches, new_matches, current_matches + ); + info!("Upper bound of remaining matches: {}", upper_bound); + + let origin_rank = token_rankings + .get(&origin_token_tuple.0) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX); + let candidate_rank = token_rankings + .get(&candidate_token_tuple.0) + .map(|(_, rank)| *rank) + .unwrap_or(usize::MAX); - if upper_bound > current_threshold { - info!("IF MIN"); - let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0]; - let candidate_token_tuple = - &vectored_candidate_bow[candidate_last_token_seen_pos.0]; + info!( + "Origin: {}, rank: {}, position: {} | Candidate: {}, rank: {}, position: {}", + String::from_utf8_lossy(&origin_token_tuple.0), + origin_rank, + origin_last_token_seen_pos.0, + String::from_utf8_lossy(&candidate_token_tuple.0), + candidate_rank, + candidate_last_token_seen_pos.0 + ); + + if current_matches >= current_threshold { + //already reached the threshold, we can stop comparing this candidate and add it to the clone map + info!( + "Threshold reached with current matches {}, adding to clone map.", + current_matches + ); + break; + } else if upper_bound + current_matches >= current_threshold { if origin_token_tuple.0 == candidate_token_tuple.0 { //it's a match info!("MATCHING!"); - info!( - "MATCH! 
origin: {}, candidate: {}", - String::from_utf8_lossy(&origin_token_tuple.0), - String::from_utf8_lossy(&candidate_token_tuple.0) - ); new_matches += min(origin_token_tuple.1, candidate_token_tuple.1); candidate_last_token_seen_pos.0 += 1; candidate_last_token_seen_pos.1 += candidate_token_tuple.1; origin_last_token_seen_pos.0 += 1; origin_last_token_seen_pos.1 += origin_token_tuple.1; - } else if token_rankings - .get(&origin_token_tuple.0) - .map(|(_, rank)| *rank) - .unwrap_or(usize::MAX) - < token_rankings - .get(&candidate_token_tuple.0) - .map(|(_, rank)| *rank) - .unwrap_or(usize::MAX) - { - //origin token is more frequent than candidate token, so we move in the origin vector - info!( - "origin_count > candidate_count: origin: {}, candidate: {}", - String::from_utf8_lossy(&origin_token_tuple.0), - String::from_utf8_lossy(&candidate_token_tuple.0) - ); - origin_last_token_seen_pos.0 += 1; - origin_last_token_seen_pos.1 += origin_token_tuple.1; - } else { - //candidate token is more frequent than origin token, so we move in the candidate vector - info!( - "candidate_count > origin_count: origin: {}, candidate: {}", - String::from_utf8_lossy(&origin_token_tuple.0), - String::from_utf8_lossy(&candidate_token_tuple.0) - ); + } else if origin_rank > candidate_rank { + //origin token is more frequent than candidate token, so we move in the candidate vector + info!("origin_count > candidate_count"); candidate_last_token_seen_pos.0 += 1; - candidate_last_token_seen_pos.1 += candidate_token_tuple.1; + candidate_last_token_seen_pos.1 += origin_token_tuple.1; + } else { + //candidate token is more frequent than origin token, so we move in the origin vector + info!("candidate_count > origin_count"); + origin_last_token_seen_pos.0 += 1; + origin_last_token_seen_pos.1 += candidate_token_tuple.1; } } else { - //the upper bound of the remaining matches is not enough to reach the threshold, so we can stop comparing this candidate - /* info!("UPPER BOUND of remaining matches 
is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", min(origin_word_count - origin_last_token_seen_pos.1, candidate_word_count - candidate_last_token_seen_pos.1), current_threshold); - info!( - "Current matches: {}, new matches: {}, total possible matches: {}", - candidate_map.get_token_matches(&candidate_id), - new_matches, - candidate_map.get_token_matches(&candidate_id) - + min( - origin_word_count - origin_last_token_seen_pos.1, - candidate_word_count - candidate_last_token_seen_pos.1 - ) - ); */ - info!("UPPER BOUND of remaining matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound, current_threshold); + info!("UPPER BOUND + current_matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound + current_matches, current_threshold); info!( "origin_last_token_seen_pos: {}, candidate_last_token_seen_pos: {}", origin_last_token_seen_pos.0, candidate_last_token_seen_pos.0 @@ -658,13 +689,14 @@ fn verify_candidates( candidate_last_token_seen_pos, ); if candidate_map.get_token_matches(&candidate_id) >= current_threshold { - //add to clone map - clone_map - .entry(origin_function_id) - .or_default() - .insert(candidate_id); + insert_clone_relation(clone_map, origin_function_id, candidate_id); + info!("*** CLONE DETECTED! ***"); info!( - "Clone detected! 
Candidate: {}, Similarity: {:.2} %", + "Origin: {}, Candidate: {}, Similarity >= {:.2} %", + function_paths_and_lengths + .get(&origin_function_id) + .map(|(path, _)| *path) + .unwrap_or("Unknown"), function_paths_and_lengths .get(&candidate_id) .map(|(path, _)| *path) @@ -674,6 +706,7 @@ fn verify_candidates( * 100.0 ); } + info!("**********") } Ok(Err(_)) => { info!("Warning: File too large at path '{}', skipping.", path); @@ -685,3 +718,23 @@ fn verify_candidates( } Ok(()) } + +fn insert_clone_relation( + clone_map: &mut CloneMap, + origin_function_id: blake3::Hash, + candidate_id: blake3::Hash, +) { + let origin_entry = clone_map + .entry(origin_function_id) + .or_insert_with(|| Either::Left(HashSet::new())); + + // Origin must always store the set of its clones as Left(HashSet<_>). + if let Either::Left(clones) = origin_entry { + clones.insert(candidate_id); + } else { + *origin_entry = Either::Left(HashSet::from([candidate_id])); + } + + // Clone points back to its origin as Right(origin_hash). 
+ clone_map.insert(candidate_id, Either::Right(origin_function_id)); +} diff --git a/src/utils/bow.rs b/src/utils/bow.rs index 897b0a8..1dc087d 100644 --- a/src/utils/bow.rs +++ b/src/utils/bow.rs @@ -99,9 +99,9 @@ impl Bow { pub fn token_rankings(&self) -> HashMap, (usize, usize)> { let mut rankings: HashMap, (usize, usize)> = HashMap::new(); let mut count_vec: Vec<(&Vec, &usize)> = self.map.iter().collect(); - //count_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count in descending order + //count_vec.sort_by(|a, b| a.1.cmp(b.1)); // Sort by count in ascending order count_vec.sort_by(|a, b| { - b.1.cmp(a.1) // primary: count descending + a.1.cmp(b.1) // primary: count ascending .then_with(|| a.0.cmp(b.0)) // secondary: token ascending }); for (rank, (token, count)) in count_vec.into_iter().enumerate() { diff --git a/src/utils/candidate_map.rs b/src/utils/candidate_map.rs index 52bee53..06d8198 100644 --- a/src/utils/candidate_map.rs +++ b/src/utils/candidate_map.rs @@ -160,7 +160,7 @@ impl CandidateMap { } } - pub fn verification_cost_estimate(&self, n: usize) -> usize { + pub fn verification_cost_estimate(&self, n: usize, origin_word_count: &usize) -> usize { let mut number_of_candidates = self.count_candidates_with_n_matches(n, "at_least"); //the candidates that have already reached n matches let mut survivors = 0usize; @@ -183,6 +183,6 @@ impl CandidateMap { } else { (length_range.0 + length_range.1) / 2 }; - number_of_candidates * average_length + number_of_candidates * (*origin_word_count + average_length) } } From 424991b475f659e78122e5b8e74b0e3df91697d4 Mon Sep 17 00:00:00 2001 From: swartling Date: Mon, 27 Apr 2026 14:40:31 +0200 Subject: [PATCH 12/14] java example functions added --- tests/data/keywords/java.json | 3 +++ .../files/java_examples/CB1/factorial.java | 4 ++++ .../files/java_examples/CB1/factorial.java.functions/2-1 | 3 +++ .../files/java_examples/CB2/factorial.java | 8 ++++++++ .../files/java_examples/CB2/factorial.java.functions/2-1 | 7 
+++++++ .../files/java_examples/CB3/factorial.java | 8 ++++++++ .../files/java_examples/CB3/factorial.java.functions/2-1 | 7 +++++++ .../files/java_examples/CB4/main.java | 8 ++++++++ .../files/java_examples/CB4/main.java.functions/2-1 | 7 +++++++ .../files/java_examples/CB5/factorial.java | 6 ++++++ .../files/java_examples/CB5/factorial.java.functions/2-1 | 5 +++++ 11 files changed, 66 insertions(+) create mode 100644 tests/data/keywords/java.json create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1 create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1 diff --git a/tests/data/keywords/java.json b/tests/data/keywords/java.json new file mode 100644 index 0000000..e0f964a --- /dev/null +++ b/tests/data/keywords/java.json @@ -0,0 +1,3 @@ +{ + "languages": ["java"] +} \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java new file mode 100644 index 0000000..71d2783 --- /dev/null +++ 
b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java @@ -0,0 +1,4 @@ +//Code Block 1 (CB1) +public static int factorial(int result) { + if(result <= 1) return 1; + return result * factorial(result-1); } \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1 new file mode 100644 index 0000000..07a6e64 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1 @@ -0,0 +1,3 @@ +public static int factorial(int result) { + if(result <= 1) return 1; + return result * factorial(result-1); } \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java new file mode 100644 index 0000000..85cffd4 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java @@ -0,0 +1,8 @@ +//Code Block 2 (CB2) +public static int factorial(int n) { + int result = 1; + for(int i=1; i<=n; i++) { + result = result * i; + } + return result; +} \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1 new file mode 100644 index 0000000..9c06340 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB2/factorial.java.functions/2-1 @@ -0,0 +1,7 @@ +public static int factorial(int n) { + int result = 1; + for(int i=1; i<=n; i++) { + result = result * i; + } + return result; +} \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java 
b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java new file mode 100644 index 0000000..6de0bc4 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java @@ -0,0 +1,8 @@ +//Code Block 3 (CB3) +public static int factorial(int n) { + if(n >= 0) { + result[0] = 1; + for(int i=1; i<=n; i++) { + result[i] = i * result[i-1]; + } + return result[n]; } } \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1 new file mode 100644 index 0000000..725c461 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB3/factorial.java.functions/2-1 @@ -0,0 +1,7 @@ +public static int factorial(int n) { + if(n >= 0) { + result[0] = 1; + for(int i=1; i<=n; i++) { + result[i] = i * result[i-1]; + } + return result[n]; } } \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java new file mode 100644 index 0000000..87d59bb --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java @@ -0,0 +1,8 @@ +//Code Block 4 (CB4) +public static void main(String[] args) { + int result = 5; + int factorial = result; + for(int i=result-1; i>1; i--) { + factorial = factorial * i; + } +} \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1 new file mode 100644 index 0000000..02d6edf --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB4/main.java.functions/2-1 @@ -0,0 +1,7 @@ +public static void main(String[] args) { + int result = 5; + int factorial = result; + for(int 
i=result-1; i>1; i--) { + factorial = factorial * i; + } +} \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java new file mode 100644 index 0000000..2ed89f7 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java @@ -0,0 +1,6 @@ +//Code Block 5 (CB5) +public int factorial(int result) { + if(result == 0) { + return 1; + } else { + return result * factorial(result-1); } } \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1 b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1 new file mode 100644 index 0000000..cd86a1e --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB5/factorial.java.functions/2-1 @@ -0,0 +1,5 @@ +public int factorial(int result) { + if(result == 0) { + return 1; + } else { + return result * factorial(result-1); } } \ No newline at end of file From 8334756271b77b27b96775748d67e81a4643c593 Mon Sep 17 00:00:00 2001 From: swartling Date: Mon, 27 Apr 2026 15:05:28 +0200 Subject: [PATCH 13/14] switched info! to debug! or warn! 
--- src/phases/type_3_duplicate_files.rs | 88 ++++++++++++++-------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs index 1a44e87..b3337cc 100644 --- a/src/phases/type_3_duplicate_files.rs +++ b/src/phases/type_3_duplicate_files.rs @@ -13,7 +13,7 @@ use polars::prelude::*; use std::cmp::{max, min}; use std::collections::{HashMap, HashSet}; use std::vec; -use tracing::info; +use tracing::{debug, info, warn}; type CloneMap = HashMap, blake3::Hash>>; @@ -208,26 +208,26 @@ pub fn run( let word = word.to_owned().as_bytes().to_ascii_lowercase(); let mut index_number = 1; for index in vector_of_indices.iter() { - info!( + debug!( "Index {} has {} entries, total length of vectors in entries: {}", index_number, index.len(), index.len_tokens() ); if let Some(entries) = index.get(&word) { - info!( + debug!( "Entries for the example word '{}' in index {}ยง:", String::from_utf8_lossy(&word), index_number ); for (function_id, count, (token_position, cumulative_count)) in entries { - info!( + debug!( "Function ID: {}, Count: {}, Token Position: {}, Cumulative Count: {}", function_id, count, token_position, cumulative_count ); } } else { - info!( + debug!( "The example word '{}' was not found in index {}.", String::from_utf8_lossy(&word), index_number @@ -319,10 +319,10 @@ fn index_builder( } } Ok(Err(_e)) => { - info!("Warning: File too large at path '{}', skipping.", path); + warn!("File too large at path '{}', skipping.", path); } Err(_e) => { - info!("Failed to read file at path '{}', skipping.", path); + warn!("Failed to read file at path '{}', skipping.", path); } } } @@ -352,7 +352,7 @@ fn delta_filter_cost( fn weighted_prefix_end(vectored_bow: &[(Vec, usize)], prefix_length: usize) -> usize { if prefix_length == 0 { - info!("Prefix length is 0, returning 0 for weighted prefix end."); + debug!("Prefix length is 0, returning 0 for weighted prefix end."); // This case shouldn't 
be seen return 0; } @@ -363,7 +363,7 @@ fn weighted_prefix_end(vectored_bow: &[(Vec, usize)], prefix_length: usize) return idx + 1; //Enumerator is 0-based, so we need to add 1 to get the correct length of the prefix vector } } - info!("Warning: prefix_length {} is greater than total token count {}, returning full length of vectored_bow.", prefix_length, vectored_bow.len()); + warn!("prefix_length {} is greater than total token count {}, returning full length of vectored_bow.", prefix_length, vectored_bow.len()); vectored_bow.len() } @@ -379,7 +379,7 @@ fn detect_clones( let word_matcher: Matcher = Matcher::words_matcher(); let p_prefix = vector_of_indices.len(); for (path, origin_word_count) in function_paths_and_lengths.values() { - info!("-----------------------------------------------------------------------------"); + debug!("-----------------------------------------------------------------------------"); // info!("Path: {}, Words: {}", path, origin_word_count); match load_file(path, 1024 * 1024 * 1024) { Ok(Ok(function_code)) => { @@ -392,7 +392,7 @@ fn detect_clones( .unwrap_or(usize::MAX) }); let origin_function_id = blake3::hash(path.as_bytes()); - info!("Origin path: {}", path); + debug!("Origin path: {}", path); let mut candidate_map = CandidateMap::new(); let prefix_length = origin_word_count @@ -432,7 +432,7 @@ fn detect_clones( continue; //skip candidates that have already been processed as origins } */ if clone_map.contains_key(&candidate.0) { - info!("DClone: SKIPPING candidate at path '{}' since it already has an entry in clone_map.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown")); + debug!("DClone: SKIPPING candidate at path '{}' since it already has an entry in clone_map.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown")); continue; } let candidate_word_count = function_paths_and_lengths @@ -479,8 +479,8 @@ fn detect_clones( total_cost_vector.push(filter_cost + 
verification_cost); if total_cost_vector[p] > total_cost_vector[p - 1] { - info!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]); - info!("The next prefix scheme {} has estimated total cost of {}, filter cost: {}, verification cost: {}.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]); + debug!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. Moving on to verification phase.", p - 1, total_cost_vector[p - 1], filter_cost_vector[p - 1], verification_cost_vector[p - 1]); + debug!("The next prefix scheme {} has estimated total cost of {}, filter cost: {}, verification cost: {}.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]); verify_candidates( origin_function_id, &origin_vectored_bow, @@ -513,15 +513,15 @@ fn detect_clones( } } } - info!("Filter cost vector: {:?}", filter_cost_vector); - info!("Verification cost vector: {:?}", verification_cost_vector); - info!("Total cost vector: {:?}", total_cost_vector); + debug!("Filter cost vector: {:?}", filter_cost_vector); + debug!("Verification cost vector: {:?}", verification_cost_vector); + debug!("Total cost vector: {:?}", total_cost_vector); } Ok(Err(_e)) => { - info!("Warning: File too large at path '{}', skipping.", path); + warn!("File too large at path '{}', skipping.", path); } Err(_e) => { - info!("Failed to read file at path '{}', skipping.", path); + warn!("Failed to read file at path '{}', skipping.", path); } } } @@ -548,7 +548,7 @@ fn verify_candidates( .get(&origin_function_id) .copied() .unwrap_or(("Unknown", 0)); - info!( + debug!( "Verifying candidates for function at path '{}', with word count {}.", origin_path, origin_word_count ); @@ -559,25 +559,25 @@ fn verify_candidates( .iter() .map(|(token, count)| 
(String::from_utf8_lossy(token).to_string(), *count)) .collect(); - info!("sorted origin vector: {:?}", origin_vector_readable); - info!("origin_function_id: {:?}", origin_function_id); - info!("origin_id as bytes: {:?}", origin_function_id.as_bytes()); + debug!("sorted origin vector: {:?}", origin_vector_readable); + debug!("origin_function_id: {:?}", origin_function_id); + debug!("origin_id as bytes: {:?}", origin_function_id.as_bytes()); for candidate_id in candidates_to_verify { - info!("----------------------"); + debug!("----------------------"); let (path, length) = function_paths_and_lengths .get(&candidate_id) .copied() .unwrap(); - info!("candidate_id as bytes: {:?}", candidate_id.as_bytes()); + debug!("candidate_id as bytes: {:?}", candidate_id.as_bytes()); if clone_map.contains_key(&candidate_id) { - info!( + debug!( "SKIPPING candidate at path '{}' since it already has an entry in clone_map.", path ); continue; } if candidate_id == origin_function_id { - info!("Skipping self-comparison for function at path '{}'.", path); + debug!("Skipping self-comparison for function at path '{}'.", path); continue; //skip comparing the function to itself } let mut origin_last_token_seen_pos = prefix_origin_last_token_seen_pos; @@ -585,7 +585,7 @@ fn verify_candidates( Ok(Ok(candidate_code)) => { // Handle successful file load // load function, sort tokens by global frequency, calculate similarity, if above threshold add to clone map - info!("Candidate loaded: {}, length: {}", path, length); + debug!("Candidate loaded: {}, length: {}", path, length); let candidate_bow = word_matcher.bag_of_words(&candidate_code.to_ascii_lowercase()); let mut vectored_candidate_bow = candidate_bow.vectorize(); vectored_candidate_bow.sort_by_key(|(token, _)| { @@ -598,7 +598,7 @@ fn verify_candidates( .iter() .map(|(token, count)| (String::from_utf8_lossy(token).to_string(), *count)) .collect(); - info!("sorted candidate vector: {:?}", candidate_vector_readable); + debug!("sorted 
candidate vector: {:?}", candidate_vector_readable); let candidate_word_count = length; let candidate_token_count = vectored_candidate_bow.len(); let current_threshold = (max(origin_word_count, candidate_word_count) as f64 @@ -620,12 +620,12 @@ fn verify_candidates( let candidate_token_tuple = &vectored_candidate_bow[candidate_last_token_seen_pos.0]; - info!("Current threshold: {}", current_threshold); - info!( + debug!("Current threshold: {}", current_threshold); + debug!( "Current matches: {} + {} = {}", prefix_matches, new_matches, current_matches ); - info!("Upper bound of remaining matches: {}", upper_bound); + debug!("Upper bound of remaining matches: {}", upper_bound); let origin_rank = token_rankings .get(&origin_token_tuple.0) @@ -636,7 +636,7 @@ fn verify_candidates( .map(|(_, rank)| *rank) .unwrap_or(usize::MAX); - info!( + debug!( "Origin: {}, rank: {}, position: {} | Candidate: {}, rank: {}, position: {}", String::from_utf8_lossy(&origin_token_tuple.0), origin_rank, @@ -648,7 +648,7 @@ fn verify_candidates( if current_matches >= current_threshold { //already reached the threshold, we can stop comparing this candidate and add it to the clone map - info!( + debug!( "Threshold reached with current matches {}, adding to clone map.", current_matches ); @@ -656,7 +656,7 @@ fn verify_candidates( } else if upper_bound + current_matches >= current_threshold { if origin_token_tuple.0 == candidate_token_tuple.0 { //it's a match - info!("MATCHING!"); + debug!("MATCHING!"); new_matches += min(origin_token_tuple.1, candidate_token_tuple.1); candidate_last_token_seen_pos.0 += 1; candidate_last_token_seen_pos.1 += candidate_token_tuple.1; @@ -664,18 +664,18 @@ fn verify_candidates( origin_last_token_seen_pos.1 += origin_token_tuple.1; } else if origin_rank > candidate_rank { //origin token is more frequent than candidate token, so we move in the candidate vector - info!("origin_count > candidate_count"); + debug!("origin_count > candidate_count"); 
candidate_last_token_seen_pos.0 += 1; candidate_last_token_seen_pos.1 += origin_token_tuple.1; } else { //candidate token is more frequent than origin token, so we move in the origin vector - info!("candidate_count > origin_count"); + debug!("candidate_count > origin_count"); origin_last_token_seen_pos.0 += 1; origin_last_token_seen_pos.1 += candidate_token_tuple.1; } } else { - info!("UPPER BOUND + current_matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound + current_matches, current_threshold); - info!( + debug!("UPPER BOUND + current_matches is {}, which is not enough to reach the threshold of {}. Stopping comparison for this candidate.", upper_bound + current_matches, current_threshold); + debug!( "origin_last_token_seen_pos: {}, candidate_last_token_seen_pos: {}", origin_last_token_seen_pos.0, candidate_last_token_seen_pos.0 ); @@ -690,8 +690,8 @@ fn verify_candidates( ); if candidate_map.get_token_matches(&candidate_id) >= current_threshold { insert_clone_relation(clone_map, origin_function_id, candidate_id); - info!("*** CLONE DETECTED! ***"); - info!( + debug!("*** CLONE DETECTED! 
***"); + debug!( "Origin: {}, Candidate: {}, Similarity >= {:.2} %", function_paths_and_lengths .get(&origin_function_id) @@ -706,13 +706,13 @@ fn verify_candidates( * 100.0 ); } - info!("**********") + debug!("**********") } Ok(Err(_)) => { - info!("Warning: File too large at path '{}', skipping.", path); + warn!("File too large at path '{}', skipping.", path); } Err(_) => { - info!("Failed to read file at path '{}', skipping.", path); + warn!("Failed to read file at path '{}', skipping.", path); } } } From b0777c9e6da1cf12710cbb49e1c97f65eb4bc6ab Mon Sep 17 00:00:00 2001 From: swartling Date: Wed, 6 May 2026 14:22:35 +0200 Subject: [PATCH 14/14] probably final implementation --- src/bin/main.rs | 2 + src/phases/parse.rs | 2 - src/phases/tokenizer.rs | 27 ++- src/phases/type_3_duplicate_files.rs | 190 +++++++++++++++--- .../files/java_examples/CB6/factorial.java | 4 + .../identical_functions.csv | 3 + 6 files changed, 191 insertions(+), 37 deletions(-) create mode 100644 tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java create mode 100644 tests/data/phases/type_3_duplicate_files/identical_functions.csv diff --git a/src/bin/main.rs b/src/bin/main.rs index 715e788..bad656f 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -272,6 +272,8 @@ fn main() { *cli_subargs.get_one::("p_prefix").unwrap(), *cli_subargs.get_one::("threshold").unwrap(), cli_subargs.get_one::("example_word"), + cli_subargs.get_flag("force"), + cli_subargs.get_one::("header").unwrap(), &logger, ) } diff --git a/src/phases/parse.rs b/src/phases/parse.rs index cabaf25..8ed9661 100644 --- a/src/phases/parse.rs +++ b/src/phases/parse.rs @@ -455,7 +455,6 @@ fn analyze_file( ignore_comments: bool, word_counter: &Matcher, ) -> Result<(String, Option)> { - info!("analyze_file called with path: {path}"); let grammar = language_to_grammar(language) .with_context(|| format!("Unsupported language: {language}"))?; // Initializes the parser @@ -463,7 +462,6 @@ fn analyze_file( 
parser.set_language(&grammar.lang)?; match load_file(path, 1024 * 1024 * 1024)? { Ok(source_code) => { - info!("File {path} loaded successfully"); // Creates a folder to store the functions of the file let target_folder: String = format!("{path}.functions"); create_dir(&target_folder)?; diff --git a/src/phases/tokenizer.rs b/src/phases/tokenizer.rs index 684adda..c7b46bf 100644 --- a/src/phases/tokenizer.rs +++ b/src/phases/tokenizer.rs @@ -149,15 +149,24 @@ pub fn run(input_path: &str, example_word: &str, _logger: &Logger) -> Result<()> } */ pub fn global_counter(input_file: &DataFrame) -> Result { + use indicatif::ProgressBar; + use std::time::Instant; + info!("Building global Bag of Words from the functions in the input file..."); + let bow_start = Instant::now(); + let paths_column = input_file.column("path").and_then(|c| c.str())?; + let total_files = paths_column.len(); + + let bow_progress = ProgressBar::new(total_files as u64); + bow_progress.set_style( + indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?, + ); + bow_progress.set_message("Building global bag-of-words..."); + let word_matcher: Matcher = Matcher::words_matcher(); let mut global_bow: Bow = Bow::new(); - for row in input_file - .column("path") - .and_then(|c| c.str()) - .unwrap() - .into_iter() - { + for row in paths_column.into_iter() { + bow_progress.inc(1); match row { Some(path) => { //let function_code = std::fs::read_to_string(path)?; @@ -168,7 +177,7 @@ pub fn global_counter(input_file: &DataFrame) -> Result { global_bow.merge(local_bow); } Ok(Err(_e)) => { - info!(" Warning: File to large at path {}", path); + info!(" Warning: File too large at path {}", path); } Err(_e) => { info!(" Warning: Could not load file at path {}", path); @@ -181,6 +190,10 @@ pub fn global_counter(input_file: &DataFrame) -> Result { } } + bow_progress.finish_and_clear(); + let bow_duration = bow_start.elapsed(); + info!("BOW building took: {:.2}s", bow_duration.as_secs_f64()); 
+ Ok(global_bow) } diff --git a/src/phases/type_3_duplicate_files.rs b/src/phases/type_3_duplicate_files.rs index b3337cc..59d1bf8 100644 --- a/src/phases/type_3_duplicate_files.rs +++ b/src/phases/type_3_duplicate_files.rs @@ -2,16 +2,19 @@ use crate::phases::tokenizer::global_counter; use crate::utils::candidate_map::*; use crate::utils::fs::*; use crate::utils::inverted_index::*; -use crate::utils::logger::Logger; +use crate::utils::logger::{log_output_file, log_write_output, Logger}; use crate::utils::regex::*; use anyhow::{/* Error, */ Result}; use blake3; +use clap::ArgAction; use clap::{Arg, Command}; use core::f64; use either::Either; +use indicatif::ProgressBar; use polars::prelude::*; use std::cmp::{max, min}; use std::collections::{HashMap, HashSet}; +use std::time::Instant; use std::vec; use tracing::{debug, info, warn}; @@ -60,7 +63,7 @@ pub fn cli() -> Command { .arg( Arg::new("threads") .short('n') - .help("Number of threads to use, default is 1.") + .help("Number of threads to use, default is 1. 
CURRENT VERSION IS SINGLE THREADED") .default_value("1") .value_parser(clap::value_parser!(usize)) ) @@ -87,6 +90,20 @@ pub fn cli() -> Command { .help("An example word to check the global Bag of Words for.") .required(false), ) + .arg( + Arg::new("force") + .short('f') + .long("force") + .help("Override the output CSV file if it already exists.") + .default_value("false") + .action(ArgAction::SetTrue) + ) + .arg( + Arg::new("header") + .long("header") + .help("Name of column storing file paths in the input CSV file.") + .default_value("path"), + ) } pub fn run( @@ -100,11 +117,14 @@ pub fn run( p_prefix: usize, //number of tokens to consider for the prefix, default is 1 threshold: f64, //threshold for the prefix length, default is 0.8 example_word: Option<&String>, //an example word to check the global Bag of Words for, optional + force: bool, //whether to override the output CSV file if it already exists + input_header: &str, //name of column storing file paths in the input CSV file _logger: &Logger, ) -> Result<()> { //let language = "java"; let language = opt_language.unwrap_or("java"); //default to java currently - let minimum_loc = 2; //temporary + let minimum_loc = 0; //temporary + let total_start = Instant::now(); let mut input_file = open_csv( input_path, Some(Schema::from_iter(vec![ @@ -156,18 +176,8 @@ pub fn run( let n_functions_after_loc = input_file.height(); - info!( - "{} functions found after filtering ({:.2} %)", //something is weird with the percentage calculation here. 
- n_functions_after_loc, - if n_functions_before_loc == 0 { - 0 - } else { - (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize - } - ); - //moved here from detect_clones - let paths_column = input_file.column("path")?.str()?; + let paths_column = input_file.column(input_header)?.str()?; let words_column = input_file.column("words")?.u32()?; let rows: Vec<(&str, usize)> = paths_column .into_iter() @@ -185,6 +195,7 @@ pub fn run( let global_bow = global_counter(&input_file)?; let token_rankings = global_bow.token_rankings(); + let index_start = Instant::now(); let vector_of_indices_plus_min_max = index_builder( &input_file, &token_rankings, @@ -192,6 +203,8 @@ pub fn run( threshold, &function_paths_and_lengths, )?; + let index_duration = index_start.elapsed(); + info!("Index building took: {:.2}s", index_duration.as_secs_f64()); // Maximum and minimum 'words' in input file let vector_of_indices = vector_of_indices_plus_min_max.0; let min_words = vector_of_indices_plus_min_max.1 .0; @@ -238,16 +251,110 @@ pub fn run( } //go through input file again? Means i can grab 'words' from the file. 
Could do something like just checking candidates + let verify_start = Instant::now(); let clone_map = detect_clones( &token_rankings, &vector_of_indices, threshold, &function_paths_and_lengths, )?; + let verify_duration = verify_start.elapsed(); + info!("Verification took: {:.2}s", verify_duration.as_secs_f64()); + + // Ok() is kept at the end of the function + // Prepare CSV outputs: map of clone -> origin, and unique files list + let default_output_path: String = format!("{input_path}.unique.csv"); + let default_map_path: String = format!("{input_path}.duplicates_map.csv"); + let output_path: &str = _output_path.unwrap_or(&default_output_path); + let map_path: &str = _map_path.unwrap_or(&default_map_path); + + // Build clone -> origin mapping + let mut clone_to_origin: HashMap = HashMap::new(); + for (k, v) in clone_map.iter() { + match v { + Either::Left(set) => { + for c in set.iter() { + clone_to_origin.insert(*c, *k); + } + } + Either::Right(origin) => { + clone_to_origin.insert(*k, *origin); + } + } + } + + // Map CSV rows + let mut map_names: Vec = Vec::new(); + let mut map_originals: Vec = Vec::new(); + for (clone_hash, origin_hash) in clone_to_origin.iter() { + let clone_path = function_paths_and_lengths + .get(clone_hash) + .map(|(p, _)| *p) + .unwrap_or("Unknown") + .to_string(); + let origin_path = function_paths_and_lengths + .get(origin_hash) + .map(|(p, _)| *p) + .unwrap_or("Unknown") + .to_string(); + map_names.push(clone_path); + map_originals.push(origin_path); + } + + let mut map_df = DataFrame::new(vec![ + polars::prelude::Column::new("name".into(), map_names), + polars::prelude::Column::new("original".into(), map_originals), + ])?; + + // Unique files: those that are not listed as clones + let mut unique_paths: Vec = Vec::new(); + let mut unique_words: Vec = Vec::new(); + for (hash, (path, words)) in function_paths_and_lengths.iter() { + if !clone_to_origin.contains_key(hash) { + unique_paths.push(path.to_string()); + unique_words.push(*words 
as u32); + } + } + + let mut output_df = DataFrame::new(vec![ + polars::prelude::Column::new("path".into(), unique_paths), + polars::prelude::Column::new("words".into(), unique_words), + ])?; + + // Check output files and write + log_output_file(output_path, false, force)?; + log_write_output(_logger, map_path, &mut map_df, false)?; + log_write_output(_logger, output_path, &mut output_df, false)?; + info!( - "Finished detecting clones. {} clones found.", - clone_map.len() + "Remaining files: {} / {:.2} %", //something is weird with the percentage calculation here. + n_functions_after_loc, + if n_functions_before_loc == 0 { + 0 + } else { + (n_functions_after_loc as f64 / n_functions_before_loc as f64 * 100.0) as usize + } ); + + let unique_files = output_df.height(); + let unique_file_percentage = (unique_files as f64 / n_functions_after_loc as f64) * 100.0; + + info!( + "Unique files: {} / {:.2} %", + unique_files, unique_file_percentage + ); + + let duplicate_files = n_functions_after_loc - unique_files; + let duplicate_file_percentage = (duplicate_files as f64 / n_functions_after_loc as f64) * 100.0; + + info!( + "Duplicate files: {} / {:.2} %", + duplicate_files, duplicate_file_percentage + ); + + let total_duration = total_start.elapsed(); + info!("Total runtime: {:.2}s", total_duration.as_secs_f64()); + Ok(()) } @@ -258,6 +365,11 @@ fn index_builder( threshold: f64, function_paths_and_lengths: &HashMap, ) -> Result<(Vec, (usize, usize))> { + info!("Building indices..."); + let index_progress = ProgressBar::new(function_paths_and_lengths.len() as u64); + index_progress.set_style( + indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?, + ); let word_matcher: Matcher = Matcher::words_matcher(); let mut vector_of_indices: Vec = Vec::new(); @@ -274,6 +386,7 @@ fn index_builder( .into_iter() .flatten() { + index_progress.inc(1); match load_file(path, 1024 * 1024 * 1024) { Ok(Ok(function_code)) => { let local_bow = 
word_matcher.bag_of_words(&function_code.to_ascii_lowercase()); @@ -326,6 +439,7 @@ fn index_builder( } } } + index_progress.finish(); info!("Finished building indices."); Ok((vector_of_indices, (min_words, max_words))) } @@ -378,8 +492,15 @@ fn detect_clones( let word_matcher: Matcher = Matcher::words_matcher(); let p_prefix = vector_of_indices.len(); + info!("Detecting clones"); + let detection_progress = ProgressBar::new(function_paths_and_lengths.len() as u64); + detection_progress.set_style( + indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?, + ); + for (path, origin_word_count) in function_paths_and_lengths.values() { debug!("-----------------------------------------------------------------------------"); + detection_progress.inc(1); // info!("Path: {}, Words: {}", path, origin_word_count); match load_file(path, 1024 * 1024 * 1024) { Ok(Ok(function_code)) => { @@ -398,7 +519,10 @@ fn detect_clones( let prefix_length = origin_word_count - ((*origin_word_count as f64) * threshold).round() as usize + 1; - + debug!( + "Prefix length: {}, origin word count: {}, threshold: {}", + prefix_length, origin_word_count, threshold + ); let init_prefix_end = weighted_prefix_end(&origin_vectored_bow, prefix_length); let mut filter_cost_vector: Vec = Vec::new(); filter_cost_vector.push(0); //cost of prefix scheme 1 is calculated from an empty prefix, so the initial cost is 0 @@ -419,14 +543,17 @@ fn detect_clones( && origin_token_position < origin_vectored_bow.len() { let token_tuple = origin_vectored_bow.get(origin_token_position).unwrap(); + //debug!("Processing token {} at position {} in the origin prefix vector for prefix scheme {}.", String::from_utf8_lossy(&token_tuple.0), origin_token_position, p); //loop through the prefix vector of the current scheme, for the first scheme this is just the original prefix vector, for the next schemes this includes additional tokens let is_new = origin_token_position + 1 == prefix_end; 
origin_cumulative_count += token_tuple.1; filter_cost += delta_filter_cost(token_tuple, vector_of_indices, p, is_new); + debug!("Origin ID {}, count {}, token {}, index {}, tok_pos {}, wordpos {}.", origin_function_id, token_tuple.1, String::from_utf8_lossy(&token_tuple.0), p, origin_token_position, origin_cumulative_count); for candidate in vector_of_indices[p - 1] .get(&token_tuple.0) .unwrap_or(&Vec::new()) { + debug!("Candidate ID {}, count {}, token {}, index {}, tok_pos {}, wordpos {}.", candidate.0, candidate.1, String::from_utf8_lossy(&token_tuple.0), p, candidate.2.0, candidate.2.1); /* if candidate_id_lt_origin_id(&candidate.0, &origin_function_id) { info!("DClone: SKIPPING candidate at path '{}' since it has a lower function ID than the origin.", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown")); continue; //skip candidates that have already been processed as origins @@ -443,6 +570,7 @@ fn detect_clones( if candidate_word_count < ((*origin_word_count as f64) * threshold).round() as usize { + debug!("DClone: SKIPPING candidate at path '{}' since its word count {} is below the threshold for clones with the origin ({} words, threshold {}).", function_paths_and_lengths.get(&candidate.0).map(|(path, _)| *path).unwrap_or("Unknown"), candidate_word_count, origin_word_count, threshold); continue; //skip candidates that are too small to reach the threshold } @@ -454,9 +582,11 @@ fn detect_clones( .round() as usize; let upper_bound = min( *origin_word_count - origin_cumulative_count, - candidate_word_count - last_token_seen_pos.1 + new_matches, //candidate.2.1 is the number of words seen up to and including this token including duplicates + candidate_word_count - last_token_seen_pos.1, //candidate.2.1 is the number of words seen up to and including this token including duplicates ); - if candidate_map.get_token_matches(&function_id) + upper_bound + if candidate_map.get_token_matches(&function_id) + + upper_bound + + 
new_matches >= current_threshold { candidate_map.add_pending_update( @@ -487,7 +617,7 @@ fn detect_clones( (origin_token_position, origin_cumulative_count), &mut candidate_map, &mut clone_map, - p_prefix, + p - 1, token_rankings, threshold, function_paths_and_lengths, @@ -497,6 +627,7 @@ fn detect_clones( //apply updates candidate_map.apply_pending_updates(function_paths_and_lengths); if p == p_prefix { + debug!("Best prefix scheme is {} with estimated total cost of {}, filter cost: {}, verification cost: {}. This is the last prefix scheme, moving on to verification phase.", p, total_cost_vector[p], filter_cost_vector[p], verification_cost_vector[p]); //return verify_candidates(candidate_map, path, function_code, p); verify_candidates( origin_function_id, @@ -525,7 +656,7 @@ fn detect_clones( } } } - + detection_progress.finish(); Ok(clone_map) } @@ -609,7 +740,7 @@ fn verify_candidates( let mut new_matches = 0usize; let prefix_matches = candidate_map.get_token_matches(&candidate_id); while origin_last_token_seen_pos.0 < origin_token_count - && candidate_last_token_seen_pos.0 < candidate_token_count + && candidate_last_token_seen_pos.0 + 1 < candidate_token_count { let upper_bound = min( origin_word_count - origin_last_token_seen_pos.1, @@ -618,7 +749,12 @@ fn verify_candidates( let current_matches = prefix_matches + new_matches; let origin_token_tuple = &origin_vectored_bow[origin_last_token_seen_pos.0]; let candidate_token_tuple = - &vectored_candidate_bow[candidate_last_token_seen_pos.0]; + &vectored_candidate_bow[candidate_last_token_seen_pos.0 + 1]; + + let candidate_current_token_pos = ( + candidate_last_token_seen_pos.0 + 1, + candidate_last_token_seen_pos.1 + candidate_token_tuple.1, + ); debug!("Current threshold: {}", current_threshold); debug!( @@ -643,7 +779,7 @@ fn verify_candidates( origin_last_token_seen_pos.0, String::from_utf8_lossy(&candidate_token_tuple.0), candidate_rank, - candidate_last_token_seen_pos.0 + candidate_current_token_pos.0 ); if 
current_matches >= current_threshold { @@ -658,15 +794,13 @@ fn verify_candidates( //it's a match debug!("MATCHING!"); new_matches += min(origin_token_tuple.1, candidate_token_tuple.1); - candidate_last_token_seen_pos.0 += 1; - candidate_last_token_seen_pos.1 += candidate_token_tuple.1; + candidate_last_token_seen_pos = candidate_current_token_pos; origin_last_token_seen_pos.0 += 1; origin_last_token_seen_pos.1 += origin_token_tuple.1; } else if origin_rank > candidate_rank { //origin token is more frequent than candidate token, so we move in the candidate vector debug!("origin_count > candidate_count"); - candidate_last_token_seen_pos.0 += 1; - candidate_last_token_seen_pos.1 += origin_token_tuple.1; + candidate_last_token_seen_pos = candidate_current_token_pos; } else { //candidate token is more frequent than origin token, so we move in the origin vector debug!("candidate_count > origin_count"); diff --git a/tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java new file mode 100644 index 0000000..71d2783 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java @@ -0,0 +1,4 @@ +//Code Block 1 (CB1) +public static int factorial(int result) { + if(result <= 1) return 1; + return result * factorial(result-1); } \ No newline at end of file diff --git a/tests/data/phases/type_3_duplicate_files/identical_functions.csv b/tests/data/phases/type_3_duplicate_files/identical_functions.csv new file mode 100644 index 0000000..3e9eee0 --- /dev/null +++ b/tests/data/phases/type_3_duplicate_files/identical_functions.csv @@ -0,0 +1,3 @@ +id,path,name,position,language,loc,words,tests/data/phases/type_3_duplicate_files/java.json,loop_statements,loop_nestings,if_statements,if_nestings,functions_calls,function_calls_nestings,params,param_kw_match,parse_error 
+0,tests/data/phases/type_3_duplicate_files/files/java_examples/CB1/factorial.java.functions/2-1,factorial,2:1,java,3,16,16,0,0,1,1,1,1,1,1,none +1,tests/data/phases/type_3_duplicate_files/files/java_examples/CB6/factorial.java.functions/2-1,factorial,2:1,java,3,16,16,0,0,1,1,1,1,1,1,none \ No newline at end of file