diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb
index 08d69908e..05bc006d4 100644
--- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb
+++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/bourreau/simple_file_extractor.rb
@@ -20,6 +20,8 @@
# along with this program. If not, see .
#
+require "fileutils"
+
# A subclass of CbrainTask::ClusterTask to run SimpleFileExtractor.
class CbrainTask::SimpleFileExtractor < ClusterTask
@@ -94,9 +96,15 @@ def save_results #:nodoc:
ids = params[:interface_userfile_ids]
# Main inputs
- patterns = patterns_as_array(params[:patterns].presence || {})
+
file_cols = FileCollection.where(:id => ids).to_a
+ patterns, repls, folds = patterns_as_arrays(
+ params[:patterns].presence || {},
+ params[:replace_paths].presence || {},
+ params[:folders].presence || {}
+ )
+
# Error and warning helpers
error_examples = {}
error_counts = {}
@@ -127,12 +135,20 @@ def save_results #:nodoc:
cache_path = userfile.cache_full_path
parent_cpath = cache_path.parent
patterns.each_with_index do |pat,patidx|
- pat = pat.dup
- pat = Pathname.new(pat).cleanpath
+
+ pat = pat.dup
+ pat = Pathname.new(pat).cleanpath
+ rep = repls[patidx]&.dup
+ rep = Pathname.new(rep) if rep
+ fold = folds[patidx].dup
# Replace "*/" at the beginning of a pattern with "userfilename/"
# This is just an optimization for flat dir DPs, removing one
# unneccesary level of globbing
+ if rep.present?
+ pat_orig = pat.to_s.dup
+ regexp = glob_to_regex(pat_orig.to_s) # used with gsub below; potentially allow the user to supply either a glob pattern or a regex at will
+ end
if pat.to_s.starts_with?("*/")
pat = pat.to_s
pat[0] = userfile.name # replaces the *
@@ -140,8 +156,9 @@ def save_results #:nodoc:
end
# Quick safety check just like in after_form on portal side
- cb_error "Wrong pattern encountered: #{pat}" if
- (! pat.relative?) || (! pat.to_s.index('/')) || (pat.to_s.start_with? "../")
+ cb_error "Wrong pattern encountered: #{pat}" if (! pat.relative?) || (! pat.to_s.index('/')) || (pat.to_s.start_with? "../")
+ cb_error "Wrong replacement pattern: #{rep}" if rep && ( (! rep.relative?) || (rep.to_s.start_with? "../") )
+
path_pattern = parent_cpath + pat
globbed_paths=Dir.glob(path_pattern.to_s)
if globbed_paths.empty?
@@ -154,27 +171,45 @@ def save_results #:nodoc:
log_it.("Globbing through missing filesystem entries", pat, userfile, filepath)
next
end
+ if rep.present?
+ relpath = (Pathname.new filepath).relative_path_from(File.realpath(Pathname.new parent_cpath)) # new path
+ target = "extracted/" + relpath.to_s.gsub(regexp, rep.to_s)
+ else
+ basename = File.basename(filepath)
+ target = "extracted/#{basename}"
+ end
if ! filepath.start_with?(cache_path.to_s)
log_it.("Extraction outside collection", pat, userfile, filepath)
next
end
+ if ! (Pathname.new target).cleanpath.to_s.start_with?("extracted/")
+ log_it.("Probably bad renaming pattern", pat_orig.to_s + ' --- ' + rep.to_s, userfile, filepath)
+ next
+ end
if File.symlink?(filepath)
- log_it.("Trying to extract a symbolic link", pat, userfile, filepath)
+ log_it.("Trying to extract a symbolic link", pat_orig, userfile, filepath)
next
end
- if ! File.file?(filepath)
- log_it.("Trying to extract a non regular file", pat, userfile, filepath)
+ if fold == "0" && ! File.file?(filepath)
+ log_it.("Trying to extract a non regular file", pat_orig, userfile, filepath)
next
end
- basename = File.basename(filepath)
- if File.file?("extracted/#{basename}")
+
+ if File.exist?(target)
log_it.("Trying to extract a file with a name matching something already extracted", pat, userfile, filepath)
next
end
+ unless rep.to_s.blank?
+ # make sure the target's parent directory exists
+ dir = File.dirname(target)
+ FileUtils.mkdir_p(dir)
+ end
+
# Make the copy
- system "cp", "#{filepath}", "extracted/#{basename}" # no .bash_escape because no bash subshell
+ system "cp", "-rn", "#{filepath}", target # no .bash_escape because no bash subshell
status = $? # a Process::Status object
+ basename = File.basename(filepath)
if status.signaled?
self.addlog("Error copying file '#{basename}': got signal #{status.termsig || 'unknown'}. This is fatal.")
return false
@@ -231,4 +266,3 @@ def save_results #:nodoc:
# friends, described in the CbrainTask Programmer Guide.
end
-
diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb
index e89de3369..17171cca4 100644
--- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb
+++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/common/simple_file_extractor.rb
@@ -23,25 +23,86 @@
# Model code common to the Bourreau and Portal side for SimpleFileExtractor.
class CbrainTask::SimpleFileExtractor
- # In the params, the list of patterns is maintained as a hash:
- # { "0" => "pat1", "1" => "pat2", etc }
- # This returns just the array of values, while preserving the ordering
- # that the keys encode:
- # [ "pat1", "pat2" etc ]
- def patterns_as_array(pat_hash)
- keys = pat_hash.keys.sort { |a,b| a.to_i <=> b.to_i }
- pat_array = keys.map { |i| pat_hash[i].presence }.compact
- pat_array
+ # In the params, the patterns, replacement paths and folder flags are each maintained as a hash:
+ # patterns: { "0" => "*/pat1", "1" => "*/pat2", etc }
+ # replace_paths: { "0" => "subfolder1", "1" => nil, etc }
+ # This method converts the three hashes into three arrays of values, while preserving the ordering
+ # that the keys encode, and skipping empty rows:
+ # [ ["*/pat1", "*/pat2"], ["subfolder1", nil], [flag1, flag2] ]
+ # A row is empty when its index has no pattern, no replacement path, and a folder flag other than "1".
+ def patterns_as_arrays(pat_hash, repl_hash, fold_hash)
+ keys = pat_hash.keys.sort_by(&:to_i)
+ pat_array = keys.map do |i|
+ [
+ pat_hash[i]&.strip.presence,
+ repl_hash[i]&.strip.presence,
+ fold_hash[i]
+ ]
+ end.select { |x, y, z| x || y || z == "1" } # filter out blank rows
+ return pat_array.transpose
end
- # This does the opposite of patterns_as_array; given
- # an array of patterns, returns a hash where the keys are
+ # This performs the opposite of patterns_as_arrays for a single array; given
+ # an array of patterns, paths, or flags, returns a hash where the keys are
# the index of the array
- def patterns_as_hash(pat_array)
- pat_hash = {}
- pat_array.each_with_index { |pat,i| pat_hash[i.to_s] = pat }
- pat_hash
+ # The returned hash uses the array indexes, stringified, as its keys, e.g. { "0" => "pat1", "1" => "pat2" }
+ #
+ def array_to_hash(arr)
+ hsh = arr.map.with_index { |pat, i| [i.to_s, pat] }.to_h
+ hsh
end
-end
+ # Best-effort mapping of a glob pattern to an anchored regex (each wildcard becomes a capture group)
+ # https://stackoverflow.com/questions/1307712/how-to-convert-glob-to-regular-expression
+ def glob_to_regex(glob)
+ escaped = ''
+ i = 0
+ while i < glob.length
+ char = glob[i]
+
+ case char
+ when '*'
+ # Check for ** (recursive)
+ if glob[i, 2] == '**'
+ escaped << '(.+?)' # non-greedy match across directories
+ i += 1
+ else
+ escaped << '([^/]+)' # * matches a single path segment
+ end
+ when '?'
+ escaped << '(.)'
+ when '['
+ # Copy character class literally until closing ]
+ j = i + 1
+ while j < glob.length && glob[j] != ']'
+ j += 1
+ end
+ char_class = glob[i..j] # include the closing ]
+ escaped << char_class
+ i = j
+ when '{'
+ # Convert {a,b,c} → (a|b|c)
+ j = i + 1
+ brace_content = ''
+ depth = 1
+ while j < glob.length && depth > 0
+ if glob[j] == '{'
+ depth += 1
+ elsif glob[j] == '}'
+ depth -= 1
+ end
+ brace_content << glob[j] if depth > 0
+ j += 1
+ end
+ alternatives = brace_content.split(',').map { |x| Regexp.escape(x) }.join('|')
+ escaped << "(#{alternatives})"
+ i = j - 1
+ else
+ escaped << Regexp.escape(char) # escape other character
+ end
+ i += 1
+ end
+ Regexp.new("\\A#{escaped}\\z")
+ end
+end
diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb
index fee3cb963..2de138538 100644
--- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb
+++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/portal/simple_file_extractor.rb
@@ -29,7 +29,11 @@ class CbrainTask::SimpleFileExtractor < PortalTask
# is created with #:nodoc: in this template.
def self.default_launch_args #:nodoc:
{
- :patterns => {}, # keys are numeric, values are the patterns
+ # keys are numeric, values are the patterns
+ :patterns => {},
+ :replace_paths => {},
+ # values are flags
+ :folders => Hash.new("0".freeze)
}
end
@@ -53,10 +57,19 @@ def after_form #:nodoc:
FileCollection.is_legal_filename?(out_name)
# Clean up pattern list
- patterns = patterns_as_array(params[:patterns].presence || {})
- patterns = patterns.map(&:presence).compact.map(&:strip).map(&:presence).compact # ignore blanks at each end
- patterns = patterns.map { |pat| Pathname.new(pat).cleanpath }
- params[:patterns] = patterns_as_hash(patterns.map(&:to_s)) # write back cleaned list
+
+ patterns, repls, folds = patterns_as_arrays(
+ params[:patterns].presence || {},
+ params[:replace_paths].presence || {},
+ params[:folders].presence || {}
+ )
+
+ patterns = patterns.map { |pat| Pathname.new(pat).cleanpath.to_s if pat}
+ repls = repls.map { |pat| Pathname.new(pat).cleanpath.to_s if pat }
+
+ params[:patterns] = array_to_hash(patterns)
+ params[:replace_paths] = array_to_hash(repls)
+ params[:folders] = array_to_hash(folds)
# Validate them and report errors; note that here the array contains Pathname objects
#
@@ -67,7 +80,26 @@ def after_form #:nodoc:
# */subdir/*/*.txt
# FileColName*/*/*.txt
patterns.each_with_index do |pat,idx|
- if ! pat.relative?
+
+ rep = repls[idx]
+ fld = folds[idx]
+
+ if rep.present? && pat.blank?
+ self.params_errors.add("replace_paths[#{idx}]", "replacement path cannot be provided without pattern")
+ end
+ if fld.to_s == "1" && pat.blank?
+ self.params_errors.add("folders[#{idx}]", "folder extraction flag cannot be set without pattern")
+ end
+ if rep.present? && ! Pathname.new(rep).relative?
+ self.params_errors.add("replace_path[#{idx}]", "is not a relative path")
+ end
+ if rep.to_s.start_with? "../"
+ self.params_errors.add("replace_path[#{idx}]", "cannot map outside of collections")
+ end
+
+ next if pat.blank? # shortcut for pattern validation if it is not present
+
+ if pat && ! Pathname.new(pat).relative?
self.params_errors.add("patterns[#{idx}]", "is not a relative path")
end
if ! pat.to_s.index('/') # must contain at least 2 components
@@ -116,4 +148,3 @@ def validate_input_ids(ids)
end
end
-
diff --git a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb
index 4109c1e63..64b8c7cd6 100644
--- a/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb
+++ b/BrainPortal/cbrain_plugins/cbrain-plugins-base/cbrain_task/simple_file_extractor/views/_task_params.html.erb
@@ -37,9 +37,15 @@