66use IPC::Cmd qw[ can_run run] ;
77use Parallel::Loops;
88
9- my $version = " 0.1 " ;
9+ my $version = " 0.2 " ;
1010
1111my $busco_dir = " " ;
1212my $full_table = " " ;
2828my @error_args = ();
2929my $length = 0;
3030my %genome_ann ;
31+ my $busco_version ;
32+ my $busco_major ;
3133
3234my $input_error = 0;
3335
@@ -36,17 +38,14 @@ sub print_help{
3638 print STDOUT " busco2snap.pl v$version \n " ;
3739 print STDOUT " \n " ;
3840 print STDOUT " Description:\n " ;
39- print STDOUT " \t Creation of a SNAP model from BUSCOs by predicting the gene structure with the retrained\n\t Augustus model, converting to zff format. The tools augustus and blastp need to be in\n\t your \$ PATH. If fathom, forge and hmm-assembler.pl are in your \$ PATH the SNAP model will be\n\t created automatically.\n " ;
40- print STDOUT " \t If Augustus predicts more than one gene at a BUSCO locus the predicted proteins are blasted\n\t against the ancestral variants of the corresponding BUSCO lineage. The gene(s) where the best\n\t hit (defined by highest bit score) is on the searched BUSCO are used only.\n " ;
41+ print STDOUT " \t Creation of a SNAP model from BUSCOs by predicting the gene structure with the retrained\n\t augustus model, converting to zff format. The tools augustus and blastp need to be in\n\t your \$ PATH. If fathom, forge and hmm-assembler are in your \$ PATH the SNAP model will be\n\t created automatically.\n " ;
42+ print STDOUT " \t If augustus predicts more than one gene at a BUSCO locus the predicted proteins are blasted\n\t against the ancestral variants of the corresponding BUSCO lineage. The gene(s) where the best\n\t hit (defined by highest bit score) is on the searched BUSCO are used only.\n " ;
4143 print STDOUT " \n " ;
4244 print STDOUT " Usage:\n " ;
43- print STDOUT " \t busco2snap.pl [ -b <busco_dir> | -ft <full_table> -rp <retraining_parameters>] \n " ;
45+ print STDOUT " \t busco2snap.pl -b <busco_dir>\n " ;
4446 print STDOUT " \n " ;
4547 print STDOUT " Mandatory:\n " ;
4648 print STDOUT " \t -b STR\t\t BUSCO output directory (run_*)\n " ;
47- print STDOUT " \t OR\n " ;
48- print STDOUT " \t -ft STR\t\t BUSCOs full table file (full_table_*)\n " ;
49- print STDOUT " \t -rp STR\t\t Directory containing retrining parameters from Augustus\n\t\t\t (augustus_output/retraining_parameters/)\n " ;
5049 print STDOUT " \n " ;
5150 print STDOUT " Options: [default]\n " ;
5251 print STDOUT " \t -dup\t\t Include duplicated BUSCOs [off]\n " ;
@@ -82,7 +81,7 @@ sub empty_element{
8281
8382sub check_rp{
8483 my ($rp_dir ) = @_ ;
85- opendir (DIR, $rp_dir ) or die " ERROR\t could not open directory $rp_dir \n " ;
84+ opendir (DIR, $rp_dir ) or die " ERROR\t Could not open directory $rp_dir \n " ;
8685
8786 my $base = " " ;
8887 while (my $file = readdir (DIR)){
@@ -108,7 +107,7 @@ sub check_rp{
108107 next ;
109108 }
110109 else {
111- print STDERR " ERROR\t Unrecognized Augustus model part $file in $rp_dir \n " ;
110+ print STDERR " ERROR\t Unrecognized augustus model part $file in $rp_dir \n " ;
112111 $input_error = 1;
113112 }
114113 }
@@ -133,7 +132,7 @@ sub check_rp{
133132 next ;
134133 }
135134 else {
136- print STDERR " ERROR\t Unrecognized Augustus model part $file in $rp_dir \n " ;
135+ print STDERR " ERROR\t Unrecognized augustus model part $file in $rp_dir \n " ;
137136 $input_error = 1;
138137 }
139138 }
@@ -149,14 +148,6 @@ sub check_rp{
149148 $busco_dir = abs_path($ARGV [$i +1]);
150149 empty_element($i ,2);
151150 }
152- if ($ARGV [$i ] eq " -ft" ){
153- $full_table = abs_path($ARGV [$i +1]);
154- empty_element($i ,2);
155- }
156- if ($ARGV [$i ] eq " -rp" ){
157- $retraining_parameters = abs_path($ARGV [$i +1]);
158- empty_element($i ,2);
159- }
160151 if ($ARGV [$i ] eq " -dup" ){
161152 $duplicated = 1;
162153 empty_element($i ,1);
@@ -210,17 +201,25 @@ sub check_rp{
210201 $input_error = 1;
211202}
212203
213- if ($full_table eq " " and $retraining_parameters eq " " and $busco_dir ne " " ){
214-
215- $full_table = ` find $busco_dir -mindepth 1 -maxdepth 1 -type f -name "full_table_*"` ;
204+ if ($busco_dir ne " " ){
205+ $full_table = ` find $busco_dir -mindepth 1 -maxdepth 1 -type f -name "full_table*"` ;
216206 chomp $full_table ;
217207 if ($full_table eq " " ){
218208 print STDERR " ERROR\t full_table not found\n " ;
219209 $input_error = 1;
220210 }
211+ $busco_version = ` grep "BUSCO version" $full_table ` ;
212+ chomp $busco_version ;
213+ $busco_version = (split (/ / ,$busco_version ))[-1];
214+ $busco_major = (split (/ \. / ,$busco_version ))[0];
215+ print STDERR " INFO\t Detected BUSCO version $busco_version \n " ;
221216
222217 $retraining_parameters = ` find $busco_dir -mindepth 2 -maxdepth 2 -type d -name "retraining_parameters"` ;
223218 chomp $retraining_parameters ;
219+ if ($busco_major == 4 or $busco_major == 5){
220+ $retraining_parameters = ` find $retraining_parameters -mindepth 1 -maxdepth 1 -type d -name "BUSCO_*"` ;
221+ chomp $retraining_parameters ;
222+ }
224223 if ($retraining_parameters eq " " ){
225224 print STDERR " ERROR\t retraining_parameters not found\n " ;
226225 $input_error = 1;
@@ -230,29 +229,8 @@ sub check_rp{
230229 }
231230}
232231else {
233- if ($full_table eq " " ){
234- print STDERR " ERROR\t -ft not specified\n " ;
235- $input_error = 1;
236- }
237- else {
238- if (not -f $full_table ){
239- print STDERR " ERROR\t $full_table is not a file\n " ;
240- $input_error = 1;
241- }
242- }
243- if ($retraining_parameters eq " " ){
244- print STDERR " ERROR\t -rp not specified\n " ;
245- $input_error = 1;
246- }
247- else {
248- if (not -d $retraining_parameters ){
249- print STDERR " ERROR\t $retraining_parameters is not a directory\n " ;
250- $input_error = 1;
251- }
252- else {
253- $species = check_rp($retraining_parameters );
254- }
255- }
232+ print STDERR " ERROR\t -b must be specified\n " ;
233+ $input_error = 1;
256234}
257235
258236if ($threads !~ m / ^\d +$ / or $threads <= 0){
@@ -269,11 +247,21 @@ sub check_rp{
269247 print STDERR " ERROR\t augustus is not in your \$ PATH\n " ;
270248 $input_error = 1;
271249}
250+ else {
251+ my $augustus_version = ` augustus --version 2>&1 | head -n 1 | sed 's/.*(//;s/).*//'` ;
252+ chomp $augustus_version ;
253+ print STDERR " INFO\t Using augustus $augustus_version \n " ;
254+ }
272255
273256if (not defined (can_run(" blastp" ))){
274257 print STDERR " ERROR\t blastp is not in your \$ PATH\n " ;
275258 $input_error = 1;
276259}
260+ else {
261+ my $blastp_version = ` blastp -version | head -n 1 | sed 's/.*: //'` ;
262+ chomp $blastp_version ;
263+ print STDERR " INFO\t Using blastp $blastp_version \n " ;
264+ }
277265
278266if (not defined ($ENV {' AUGUSTUS_CONFIG_PATH' })){
279267 print STDERR " ERROR\t\$ AUGUSTUS_CONFIG_PATH needs to be set\n " ;
@@ -343,24 +331,48 @@ sub check_rp{
343331
344332print STDERR " INFO\t Extracting BUSCOs and their locations from $full_table \n " ;
345333
346- open (FT, ' <' , $full_table ) or die " ERROR\t could not open $full_table \n " ;
334+ if ($busco_major == 4 or $busco_major == 5){
335+ my $busco_log = abs_path(" $busco_dir /../logs/busco.log" );
336+ open (LOG, ' <' , $busco_log ) or die " ERROR\t Could not open $busco_log \n " ;
337+ while (my $line = <LOG>){
338+ chomp $line ;
339+ if ($line =~ m / _input_filepath/ ){
340+ $target_fasta = $line ;
341+ $target_fasta =~ s / ^.*': '// ;
342+ $target_fasta =~ s / ',$// ;
343+ }
344+ if ($line =~ m / lineage_dataset/ ){
345+ $lineage_path = $line ;
346+ $lineage_path =~ s / ^.*': '// ;
347+ $lineage_path =~ s / ',$// ;
348+ }
349+ }
350+ close LOG;
351+ }
352+
353+ open (FT, ' <' , $full_table ) or die " ERROR\t Could not open $full_table \n " ;
347354
348355while (my $line = <FT>){
349356 if ($line =~ m / ^#/ ){
350- if ($line =~ m / ^# To reproduce this run: / ){
351- my @run = split (/ / ,$line );
352- for (my $i = 0; $i < scalar (@run ); $i ++){
353- if ($run [$i ] eq " -i" ){
354- $target_fasta = $run [$i +1];
355- }
356- if ($run [$i ] eq " -l" ){
357- $lineage_path = $run [$i +1];
358- if (substr ($lineage_path ,-1) eq " /" ){
359- $lineage_path = substr ($lineage_path ,0,-1);
357+ if ($busco_major < 4){
358+ if ($line =~ m / ^# To reproduce this run: / ){
359+ my @run = split (/ / ,$line );
360+ for (my $i = 0; $i < scalar (@run ); $i ++){
361+ if ($run [$i ] eq " -i" ){
362+ $target_fasta = $run [$i +1];
363+ }
364+ if ($run [$i ] eq " -l" ){
365+ $lineage_path = $run [$i +1];
366+ if (substr ($lineage_path ,-1) eq " /" ){
367+ $lineage_path = substr ($lineage_path ,0,-1);
368+ }
360369 }
361370 }
371+ next ;
372+ }
373+ else {
374+ next ;
362375 }
363- next ;
364376 }
365377 else {
366378 next ;
@@ -412,7 +424,7 @@ sub check_rp{
412424my $fa_frac = 0;
413425
414426if ($dry == 0){
415- open (FA, ' <' , $target_fasta ) or die " ERROR\t could not open $target_fasta \n " ;
427+ open (FA, ' <' , $target_fasta ) or die " ERROR\t Could not open $target_fasta \n " ;
416428
417429 my $head = " " ;
418430 my $out_file = " " ;
@@ -435,7 +447,7 @@ sub check_rp{
435447
436448 $out_file = $head . " .fa" ;
437449 $out_file =~ tr / [\/><|:&\\,;?*]/ _/ ;
438- open (OUT, ' >' , " $out_dir /$out_file " ) or die " ERROR\t could not open $out_dir /$out_file \n " ;
450+ open (OUT, ' >' , " $out_dir /$out_file " ) or die " ERROR\t Could not open $out_dir /$out_file \n " ;
439451 push (@out_fa ," $out_dir /$out_file " );
440452 }
441453 }
@@ -461,6 +473,11 @@ sub check_rp{
461473 exe_cmd($cmd ,$verbose ,$dry );
462474}
463475
476+ if ($keep_tmp == 0){
477+ $cmd = " rm $out_dir /makeblastdb.log $out_dir /makeblastdb.err" ;
478+ exe_cmd($cmd ,$verbose ,$dry );
479+ }
480+
464481my @cegma_gffs = ();
465482my @buscos_keys = keys (%buscos );
466483
@@ -667,7 +684,18 @@ sub check_rp{
667684print STDERR " " x $length . " \r " ;
668685
669686if ($keep_tmp == 0){
670- my @db = (" ancestral_variants" ," ancestral_variants.pog" ," ancestral_variants.psd" ," ancestral_variants.psi" ," ancestral_variants.phr" ," ancestral_variants.psq" ," ancestral_variants.pin" );
687+ my @db = (" ancestral_variants" ,
688+ " ancestral_variants.pog" ,
689+ " ancestral_variants.psd" ,
690+ " ancestral_variants.psi" ,
691+ " ancestral_variants.phr" ,
692+ " ancestral_variants.psq" ,
693+ " ancestral_variants.pin" ,
694+ " ancestral_variants.pto" ,
695+ " ancestral_variants.ptf" ,
696+ " ancestral_variants.pot" ,
697+ " ancestral_variants.pos" ,
698+ " ancestral_variants.pdb" );
671699 foreach (@db ){
672700 if (-f " $out_dir /$_ " ){
673701 $cmd = " rm $out_dir /$_ " ;
@@ -710,6 +738,9 @@ sub check_rp{
710738}
711739
712740if (defined (can_run(" fathom" )) and defined (can_run(" forge" )) and defined (can_run(" hmm-assembler.pl" ))){
741+ my $snap_version = ` fathom 2>&1 | grep "version" | sed 's/^.*(version //;s/)\$ //'` ;
742+ chomp $snap_version ;
743+ print STDERR " INFO\t Using snap $snap_version \n " ;
713744 print STDERR " INFO\t Creating snap model $out_dir /$prefix .snap.hmm\n " ;
714745 if ($verbose == 1){
715746 print STDERR " CMD\t cd $out_dir \n " ;
@@ -724,9 +755,26 @@ sub check_rp{
724755 $cmd = " hmm-assembler.pl $prefix . > $prefix .snap.hmm 2> hmm-assembler.err" ;
725756 exe_cmd($cmd ,$verbose ,$dry );
726757 if ($keep_tmp == 0){
758+ $cmd = " rm genome.dna genome.ann" ;
759+ exe_cmd($cmd ,$verbose ,$dry );
727760 my @patterns = (" *count" ," *model" ," *duration" ," transitions" ," phaseprefs" ," alt.ann" ," alt.dna" ," err.ann" ," err.dna" ," export.aa" ," export.ann" ," export.dna" ," export.tx" ," olp.ann" ," olp.dna" ," uni.ann" ," uni.dna" ," wrn.ann" ," wrn.dna" );
728761 $cmd = " rm \$ (find -mindepth 1 -maxdepth 1 -type f -name \" " . join (" \" -or -name \" " ,@patterns ) . " \" )" ;
729762 exe_cmd($cmd ,$verbose ,$dry );
763+ my @logs = (" fathom.log" ,
764+ " fathom.err" ,
765+ " forge.log" ,
766+ " forge.err" ,
767+ " hmm-assembler.err" );
768+ my @rm_logs = ();
769+ foreach (@logs ){
770+ if (-z " $out_dir /$_ " ){
771+ push (@rm_logs ," $out_dir /$_ " );
772+ }
773+ }
774+ if (scalar (@rm_logs ) > 0){
775+ $cmd = " rm " . join (" " ,@rm_logs );
776+ exe_cmd($cmd ,$verbose ,$dry );
777+ }
730778 }
731779}
732780
0 commit comments