diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb index 08d69908e..05bc006d4 100644 --- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb +++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb @@ -20,6 +20,8 @@ # along with this program. If not, see . # +require "fileutils" + # A subclass of CbrainTask::ClusterTask to run SimpleFileExtractor. class CbrainTask::SimpleFileExtractor < ClusterTask @@ -94,9 +96,15 @@ def save_results #:nodoc: ids = params[:interface_userfile_ids] # Main inputs - patterns = patterns_as_array(params[:patterns].presence || {}) + file_cols = FileCollection.where(:id => ids).to_a + patterns, repls, folds = patterns_as_arrays( + params[:patterns].presence || {}, + params[:replace_paths].presence || {}, + params[:folders].presence || {} + ) + # Error and warning helpers error_examples = {} error_counts = {} @@ -127,12 +135,20 @@ def save_results #:nodoc: cache_path = userfile.cache_full_path parent_cpath = cache_path.parent patterns.each_with_index do |pat,patidx| - pat = pat.dup - pat = Pathname.new(pat).cleanpath + + pat = pat.dup + pat = Pathname.new(pat).cleanpath + rep = repls[patidx]&.dup + rep = Pathname.new(rep) if rep + fold = folds[patidx].dup # Replace "*/" at the beginning of a pattern with "userfilename/" # This is just an optimization for flat dir DPs, removing one # unneccesary level of globbing + if rep.present? 
+ pat_orig = pat.to_s.dup + regexp = glob_to_regex(pat_orig.to_s) # to use gsub - potentially maybe allow use globe pattern or regex on will + end if pat.to_s.starts_with?("*/") pat = pat.to_s pat[0] = userfile.name # replaces the * @@ -140,8 +156,9 @@ def save_results #:nodoc: end # Quick safety check just like in after_form on portal side - cb_error "Wrong pattern encountered: #{pat}" if - (! pat.relative?) || (! pat.to_s.index('/')) || (pat.to_s.start_with? "../") + cb_error "Wrong pattern encountered: #{pat}" if (! pat.relative?) || (! pat.to_s.index('/')) || (pat.to_s.start_with? "../") + cb_error "Wrong replacement pattern: #{rep}" if rep && ( (! rep.relative?) || (rep.to_s.start_with? "../") ) + path_pattern = parent_cpath + pat globbed_paths=Dir.glob(path_pattern.to_s) if globbed_paths.empty? @@ -154,27 +171,45 @@ def save_results #:nodoc: log_it.("Globbing through missing filesystem entries", pat, userfile, filepath) next end + if rep.present? + relpath = (Pathname.new filepath).relative_path_from(File.realpath(Pathname.new parent_cpath)) # new path + target = "extracted/" + relpath.to_s.gsub(regexp, rep.to_s) + else + basename = File.basename(filepath) + target = "extracted/#{basename}" + end if ! filepath.start_with?(cache_path.to_s) log_it.("Extraction outside collection", pat, userfile, filepath) next end + if ! (Pathname.new target).cleanpath.to_s.start_with?("extracted/") + log_it.("Probably bad renaming pattern", pat_orig.to_s + ' --- ' + rep.to_s, userfile, filepath) + next + end if File.symlink?(filepath) - log_it.("Trying to extract a symbolic link", pat, userfile, filepath) + log_it.("Trying to extract a symbolic link", pat_orig, userfile, filepath) next end - if ! File.file?(filepath) - log_it.("Trying to extract a non regular file", pat, userfile, filepath) + if fold == "0" && ! 
File.file?(filepath) + log_it.("Trying to extract a non regular file", pat_orig, userfile, filepath) next end - basename = File.basename(filepath) - if File.file?("extracted/#{basename}") + + if File.exist?(target) log_it.("Trying to extract a file with a name matching something already extracted", pat, userfile, filepath) next end + unless rep.to_s.blank? + #makesure path exists + dir = File.dirname(target) + FileUtils.mkdir_p(dir) + end + # Make the copy - system "cp", "#{filepath}", "extracted/#{basename}" # no .bash_escape because no bash subshell + system "cp", "-rn", "#{filepath}", target # no .bash_escape because no bash subshell status = $? # a Process::Status object + basename = File.basename(filepath) if status.signaled? self.addlog("Error copying file '#{basename}': got signal #{status.termsig || 'unknown'}. This is fatal.") return false @@ -231,4 +266,3 @@ def save_results #:nodoc: # friends, described in the CbrainTask Programmer Guide. end - diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb index e89de3369..17171cca4 100644 --- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb +++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb @@ -23,25 +23,86 @@ # Model code common to the Bourreau and Portal side for SimpleFileExtractor. 
class CbrainTask::SimpleFileExtractor

  # In the params, the match patterns, the replacement paths and the
  # folder flags are each maintained as a hash keyed by a row number:
  #
  #   { "0" => "*/pat1",     "1" => "*/pat2", ... }   # patterns
  #   { "0" => "subfolder1", "1" => nil,      ... }   # replacement paths
  #   { "0" => "1",          "1" => "0",      ... }   # folder flags
  #
  # This method converts the three hashes into three parallel arrays,
  # ordered by the numeric value of the keys of +pat_hash+:
  #
  #   [ [ "*/pat1", "*/pat2" ], [ "subfolder1", nil ], [ "1", "0" ] ]
  #
  # Patterns and replacement paths are stripped and blank ones become nil.
  # Folder flags are normalized to "1" (set) or "0" (anything else,
  # including a missing entry). Rows with no pattern, no replacement path
  # and no folder flag set are skipped; rows with only a replacement path
  # or only a flag are kept (with a nil pattern).
  #
  # Always returns exactly three arrays, even when no row is left, so that
  # callers can safely destructure the result.
  def patterns_as_arrays(pat_hash, repl_hash, fold_hash)
    blank_to_nil = ->(value) { text = value.to_s.strip; text.empty? ? nil : text }

    rows = pat_hash.keys.sort_by { |key| key.to_s.to_i }.map do |key|
      [
        blank_to_nil.(pat_hash[key]),
        blank_to_nil.(repl_hash[key]),
        fold_hash[key].to_s == "1" ? "1" : "0",
      ]
    end
    rows = rows.select { |pat, rep, fold| pat || rep || fold == "1" } # filter out blank rows

    # [].transpose is [], which would destructure into three nils
    return [ [], [], [] ] if rows.empty?
    rows.transpose
  end

  # Performs the opposite conversion of patterns_as_arrays() for a single
  # array: given an array of patterns, paths or flags, returns a hash where
  # the keys are the (stringified) indexes of the array and the values are
  # the array elements:
  #
  #   [ "a", "b" ]  =>  { "0" => "a", "1" => "b" }
  def array_to_hash(arr)
    arr.map.with_index { |value, idx| [ idx.to_s, value ] }.to_h
  end

  # Best effort mapping of a glob pattern to an anchored Regexp
  # (\A ... \z) in which wildcards become capture groups:
  #
  #   *        => ([^/]*)       anything within one path component
  #   **/      => (?:(.*?)/)?   zero or more directories (when it starts
  #                             a path component); the group excludes the
  #                             final slash
  #   **       => (.+?)         any other occurence of a double star
  #   ?        => ([^/])        one character within a path component
  #   {a,b}    => (a|b)         alternatives, taken literally
  #   [abc]    => [abc]         character set, copied as is (NOT a group);
  #                             a leading ! is turned into ^
  #   \x       => x             the escaped character, taken literally
  #
  # Limitations: wildcards or nested braces inside {} are not interpreted,
  # and the content of a character set is not escaped. An unterminated
  # [ or { is matched as a literal character.
  #
  # See https://stackoverflow.com/questions/1307712/how-to-convert-glob-to-regular-expression
  def glob_to_regex(glob)
    glob  = glob.to_s
    regex = ''.dup
    i     = 0

    while i < glob.length
      char = glob[i]

      case char
      when '\\'
        # Backslash escapes the next character; a trailing one is literal
        if i + 1 < glob.length
          regex << Regexp.escape(glob[i + 1])
          i += 1
        else
          regex << Regexp.escape(char)
        end
      when '*'
        if glob[i, 2] != '**'
          regex << '([^/]*)' # a glob * can match nothing, e.g. "file*.txt" vs "file.txt"
        elsif glob[i + 2] == '/' && (i == 0 || glob[i - 1] == '/')
          regex << '(?:(.*?)/)?' # "**/" also matches zero directories
          i += 2
        else
          regex << '(.+?)' # non-greedy match across directories
          i += 1
        end
      when '?'
        regex << '([^/])'
      when '['
        # Search from i+2 so that the set contains at least one character
        close   = glob.index(']', i + 2)
        content = close ? glob[(i + 1)...close] : ''
        negated = content.start_with?('!', '^')
        content = content[1..-1] if negated
        if content.empty?
          regex << Regexp.escape(char) # unterminated or empty set
        else
          regex << "[#{negated ? '^' : ''}#{content}]"
          i = close
        end
      when '{'
        # Locate the closing brace matching this one
        depth = 0
        close = (i...glob.length).find do |k|
          depth += 1 if glob[k] == '{'
          depth -= 1 if glob[k] == '}'
          depth.zero?
        end
        if close
          # limit -1 keeps trailing empty alternatives, as in "{a,}"
          alternatives = glob[(i + 1)...close].split(',', -1).map { |alt| Regexp.escape(alt) }
          regex << "(#{alternatives.join('|')})"
          i = close
        else
          regex << Regexp.escape(char) # unterminated brace
        end
      else
        regex << Regexp.escape(char) # escape other characters
      end

      i += 1
    end

    Regexp.new("\\A#{regex}\\z")
  end

end

# What follows is the header of the next section of the patch (portal side),
# which shared a physical line with the end of the class above; it is kept verbatim.
# diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb
# index fee3cb963..2de138538 100644
# --- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb
# +++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb
# @@ -29,7 +29,11 @@ class CbrainTask::SimpleFileExtractor < PortalTask
#    # is created with #:nodoc: in this template.
def self.default_launch_args #:nodoc: { - :patterns => {}, # keys are numeric, values are the patterns + # keys are numeric, values are the patterns + :patterns => {}, + :replace_paths => {}, + # values are flags + :folders => Hash.new("0".freeze) } end @@ -53,10 +57,19 @@ def after_form #:nodoc: FileCollection.is_legal_filename?(out_name) # Clean up pattern list - patterns = patterns_as_array(params[:patterns].presence || {}) - patterns = patterns.map(&:presence).compact.map(&:strip).map(&:presence).compact # ignore blanks at each end - patterns = patterns.map { |pat| Pathname.new(pat).cleanpath } - params[:patterns] = patterns_as_hash(patterns.map(&:to_s)) # write back cleaned list + + patterns, repls, folds = patterns_as_arrays( + params[:patterns].presence || {}, + params[:replace_paths].presence || {}, + params[:folders].presence || {} + ) + + patterns = patterns.map { |pat| Pathname.new(pat).cleanpath.to_s if pat} + repls = repls.map { |pat| Pathname.new(pat).cleanpath.to_s if pat } + + params[:patterns] = array_to_hash(patterns) + params[:replace_paths] = array_to_hash(repls) + params[:folders] = array_to_hash(folds) # Validate them and report errors; note that here the array contains Pathname objects # @@ -67,7 +80,26 @@ def after_form #:nodoc: # */subdir/*/*.txt # FileColName*/*/*.txt patterns.each_with_index do |pat,idx| - if ! pat.relative? + + rep = repls[idx] + fld = folds[idx] + + if rep.present? && pat.blank? + self.params_errors.add("replace_paths[#{idx}]", "replacement path cannot be provided without pattern") + end + if fld.to_s == "1" && pat.blank? + self.params_errors.add("folders[#{idx}]", "folder extraction flag cannot be set without pattern") + end + if rep.present? && ! Pathname.new(rep).relative? + self.params_errors.add("replace_path[#{idx}]", "is not a relative path") + end + if rep.to_s.start_with? "../" + self.params_errors.add("replace_path[#{idx}]", "cannot map outside of collections") + end + + next if pat.blank? 
# shortcut for pattern validation if it is not present + + if pat && ! Pathname.new(pat).relative? self.params_errors.add("patterns[#{idx}]", "is not a relative path") end if ! pat.to_s.index('/') # must contain at least 2 components @@ -116,4 +148,3 @@ def validate_input_ids(ids) end end - diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb index 4109c1e63..64b8c7cd6 100644 --- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb +++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb @@ -37,9 +37,15 @@
- Extraction patterns: + Patterns to match and place files <% 10.times do |i| %> - <%= form.params_text_field "patterns[#{i}]", :size => 120 %>

+ <%= form.label "patterns[#{i}]", "Match pattern # #{i}"%> + <%= form.params_text_field "patterns[#{i}]", :size => 100 %>

+ <%= form.label "replace_paths[#{i}]", "Substitute" %> + <%= form.params_text_field "replace_paths[#{i}]", :size => 100 %>

+ <%= form.label "folders[#{i}]", "Extract folders?" %> + <%= form.params_check_box "folders[#{i}]" %>

+


<% end %>

About these patterns: @@ -62,6 +68,10 @@ Typically, all patterns will start with */ because the first component must match the names of the FileCollections themselves.

+ If you like, you can provide a replacement pattern that can include \1, \2, etc. to reference the 1st, 2nd, etc. matched + wildcards (* or ?). + \+ stands for the last one and \0 stands for the entire path. + See the 'help' link above for more information and examples.

@@ -70,4 +80,3 @@ Output name <%= form.params_text_field :output_file_name, :size => 40 %> -