Skip to content

Commit f687c6a

Browse files
author
Tilman Schell
committed
Version 0.2
1 parent 4fbf1d3 commit f687c6a

File tree

2 files changed

+110
-65
lines changed

2 files changed

+110
-65
lines changed

README.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
# busco2snap.pl v0.1
1+
# busco2snap.pl v0.2
22

33
## Description
44
__Creation of a SNAP model from [BUSCO](https://gitlab.com/ezlab/busco) output.__
55

66
Using [MAKER's](https://www.yandell-lab.org/software/maker.html) `cegma2zff` as archetype, `busco2snap.pl` creates a SNAP model from complete, single-copy BUSCOs by predicting the gene structure with the retrained Augustus model and converting to zff format. The tools `augustus` and `blastp` need to be in your `$PATH`. If `fathom`, `forge` and `hmm-assembler.pl` are in your `$PATH` the SNAP model will be created automatically.
77
If Augustus predicts more than one gene at a BUSCO locus, the predicted proteins are blasted against the ancestral variants of the corresponding BUSCO lineage. The gene(s) where the best hit (defined by highest bit score) is on the searched BUSCO are used only.
88

9-
___Currently BUSCO 3.0.2 output is supported only___
9+
#### Tested with BUSCO versions 3.0.2, 4.1.4 and 5.2.2
1010

1111
## Dependencies
1212

@@ -23,13 +23,10 @@ Optional:
2323
## Usage
2424

2525
```
26-
busco2snap.pl [-b <busco_dir> | -ft <full_table> -rp <retraining_parameters>]
26+
busco2snap.pl -b <busco_dir>
2727
2828
Mandatory:
2929
-b STR BUSCO output directory (run_*)
30-
OR
31-
-ft STR BUSCOs full table file (full_table_*)
32-
-rp STR Directory containing retrining parameters from Augustus
3330
(augustus_output/retraining_parameters/)
3431
3532
Options: [default]

busco2snap.pl

Lines changed: 107 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
use IPC::Cmd qw[can_run run];
77
use Parallel::Loops;
88

9-
my $version = "0.1";
9+
my $version = "0.2";
1010

1111
my $busco_dir = "";
1212
my $full_table = "";
@@ -28,6 +28,8 @@
2828
my @error_args = ();
2929
my $length = 0;
3030
my %genome_ann;
31+
my $busco_version;
32+
my $busco_major;
3133

3234
my $input_error = 0;
3335

@@ -36,17 +38,14 @@ sub print_help{
3638
print STDOUT "busco2snap.pl v$version\n";
3739
print STDOUT "\n";
3840
print STDOUT "Description:\n";
39-
print STDOUT "\tCreation of a SNAP model from BUSCOs by predicting the gene structure with the retrained\n\tAugustus model, converting to zff format. The tools augustus and blastp need to be in\n\tyour \$PATH. If fathom, forge and hmm-assembler.pl are in your \$PATH the SNAP model will be\n\tcreated automatically.\n";
40-
print STDOUT "\tIf Augustus predicts more than one gene at a BUSCO locus the predicted proteins are blasted\n\tagainst the ancestral variants of the corresponding BUSCO lineage. The gene(s) where the best\n\thit (defined by highest bit score) is on the searched BUSCO are used only.\n";
41+
print STDOUT "\tCreation of a SNAP model from BUSCOs by predicting the gene structure with the retrained\n\taugustus model, converting to zff format. The tools augustus and blastp need to be in\n\tyour \$PATH. If fathom, forge and hmm-assembler are in your \$PATH the SNAP model will be\n\tcreated automatically.\n";
42+
print STDOUT "\tIf augustus predicts more than one gene at a BUSCO locus the predicted proteins are blasted\n\tagainst the ancestral variants of the corresponding BUSCO lineage. The gene(s) where the best\n\thit (defined by highest bit score) is on the searched BUSCO are used only.\n";
4143
print STDOUT "\n";
4244
print STDOUT "Usage:\n";
43-
print STDOUT "\tbusco2snap.pl [-b <busco_dir> | -ft <full_table> -rp <retraining_parameters>]\n";
45+
print STDOUT "\tbusco2snap.pl -b <busco_dir>\n";
4446
print STDOUT "\n";
4547
print STDOUT "Mandatory:\n";
4648
print STDOUT "\t-b STR\t\tBUSCO output directory (run_*)\n";
47-
print STDOUT "\tOR\n";
48-
print STDOUT "\t-ft STR\t\tBUSCOs full table file (full_table_*)\n";
49-
print STDOUT "\t-rp STR\t\tDirectory containing retrining parameters from Augustus\n\t\t\t(augustus_output/retraining_parameters/)\n";
5049
print STDOUT "\n";
5150
print STDOUT "Options: [default]\n";
5251
print STDOUT "\t-dup\t\tInclude duplicated BUSCOs [off]\n";
@@ -82,7 +81,7 @@ sub empty_element{
8281

8382
sub check_rp{
8483
my ($rp_dir) = @_;
85-
opendir (DIR, $rp_dir) or die "ERROR\tcould not open directory $rp_dir\n";
84+
opendir (DIR, $rp_dir) or die "ERROR\tCould not open directory $rp_dir\n";
8685

8786
my $base = "";
8887
while (my $file = readdir(DIR)){
@@ -108,7 +107,7 @@ sub check_rp{
108107
next;
109108
}
110109
else{
111-
print STDERR "ERROR\tUnrecognized Augustus model part $file in $rp_dir\n";
110+
print STDERR "ERROR\tUnrecognized augustus model part $file in $rp_dir\n";
112111
$input_error = 1;
113112
}
114113
}
@@ -133,7 +132,7 @@ sub check_rp{
133132
next;
134133
}
135134
else{
136-
print STDERR "ERROR\tUnrecognized Augustus model part $file in $rp_dir\n";
135+
print STDERR "ERROR\tUnrecognized augustus model part $file in $rp_dir\n";
137136
$input_error = 1;
138137
}
139138
}
@@ -149,14 +148,6 @@ sub check_rp{
149148
$busco_dir = abs_path($ARGV[$i+1]);
150149
empty_element($i,2);
151150
}
152-
if ($ARGV[$i] eq "-ft"){
153-
$full_table = abs_path($ARGV[$i+1]);
154-
empty_element($i,2);
155-
}
156-
if ($ARGV[$i] eq "-rp"){
157-
$retraining_parameters = abs_path($ARGV[$i+1]);
158-
empty_element($i,2);
159-
}
160151
if ($ARGV[$i] eq "-dup"){
161152
$duplicated = 1;
162153
empty_element($i,1);
@@ -210,17 +201,25 @@ sub check_rp{
210201
$input_error = 1;
211202
}
212203

213-
if($full_table eq "" and $retraining_parameters eq "" and $busco_dir ne ""){
214-
215-
$full_table = `find $busco_dir -mindepth 1 -maxdepth 1 -type f -name "full_table_*"`;
204+
if($busco_dir ne ""){
205+
$full_table = `find $busco_dir -mindepth 1 -maxdepth 1 -type f -name "full_table*"`;
216206
chomp $full_table;
217207
if($full_table eq ""){
218208
print STDERR "ERROR\tfull_table not found\n";
219209
$input_error = 1;
220210
}
211+
$busco_version = `grep "BUSCO version" $full_table`;
212+
chomp $busco_version;
213+
$busco_version = (split(/ /,$busco_version))[-1];
214+
$busco_major = (split(/\./,$busco_version))[0];
215+
print STDERR "INFO\tDetected BUSCO version $busco_version\n";
221216

222217
$retraining_parameters = `find $busco_dir -mindepth 2 -maxdepth 2 -type d -name "retraining_parameters"`;
223218
chomp $retraining_parameters;
219+
if($busco_major == 4 or $busco_major == 5){
220+
$retraining_parameters = `find $retraining_parameters -mindepth 1 -maxdepth 1 -type d -name "BUSCO_*"`;
221+
chomp $retraining_parameters;
222+
}
224223
if($retraining_parameters eq ""){
225224
print STDERR "ERROR\tretraining_parameters not found\n";
226225
$input_error = 1;
@@ -230,29 +229,8 @@ sub check_rp{
230229
}
231230
}
232231
else{
233-
if($full_table eq ""){
234-
print STDERR "ERROR\t-ft not specified\n";
235-
$input_error = 1;
236-
}
237-
else{
238-
if(not -f $full_table){
239-
print STDERR "ERROR\t$full_table is not a file\n";
240-
$input_error = 1;
241-
}
242-
}
243-
if($retraining_parameters eq ""){
244-
print STDERR "ERROR\t-rp not specified\n";
245-
$input_error = 1;
246-
}
247-
else{
248-
if(not -d $retraining_parameters){
249-
print STDERR "ERROR\t$retraining_parameters is not a directory\n";
250-
$input_error = 1;
251-
}
252-
else{
253-
$species = check_rp($retraining_parameters);
254-
}
255-
}
232+
print STDERR "ERROR\t-b must be specified\n";
233+
$input_error = 1;
256234
}
257235

258236
if($threads !~ m/^\d+$/ or $threads <= 0){
@@ -269,11 +247,21 @@ sub check_rp{
269247
print STDERR "ERROR\taugustus is not in your \$PATH\n";
270248
$input_error = 1;
271249
}
250+
else{
251+
my $augustus_version = `augustus --version 2>&1 | head -n 1 | sed 's/.*(//;s/).*//'`;
252+
chomp $augustus_version;
253+
print STDERR "INFO\tUsing augustus $augustus_version\n";
254+
}
272255

273256
if(not defined(can_run("blastp"))){
274257
print STDERR "ERROR\tblastp is not in your \$PATH\n";
275258
$input_error = 1;
276259
}
260+
else{
261+
my $blastp_version = `blastp -version | head -n 1 | sed 's/.*: //'`;
262+
chomp $blastp_version;
263+
print STDERR "INFO\tUsing blastp $blastp_version\n";
264+
}
277265

278266
if(not defined($ENV{'AUGUSTUS_CONFIG_PATH'})){
279267
print STDERR "ERROR\t\$AUGUSTUS_CONFIG_PATH needs to be set\n";
@@ -343,24 +331,48 @@ sub check_rp{
343331

344332
print STDERR "INFO\tExtracting BUSCOs and their locations from $full_table\n";
345333

346-
open (FT, '<', $full_table) or die "ERROR\tcould not open $full_table\n";
334+
if($busco_major == 4 or $busco_major == 5){
335+
my $busco_log = abs_path("$busco_dir/../logs/busco.log");
336+
open (LOG, '<', $busco_log) or die "ERROR\tCould not open $busco_log\n";
337+
while (my $line = <LOG>){
338+
chomp $line;
339+
if($line =~ m/_input_filepath/){
340+
$target_fasta = $line;
341+
$target_fasta =~ s/^.*': '//;
342+
$target_fasta =~ s/',$//;
343+
}
344+
if($line =~ m/lineage_dataset/){
345+
$lineage_path = $line;
346+
$lineage_path =~ s/^.*': '//;
347+
$lineage_path =~ s/',$//;
348+
}
349+
}
350+
close LOG;
351+
}
352+
353+
open (FT, '<', $full_table) or die "ERROR\tCould not open $full_table\n";
347354

348355
while (my $line = <FT>){
349356
if($line =~ m/^#/){
350-
if($line =~ m/^# To reproduce this run: /){
351-
my @run = split(/ /,$line);
352-
for(my $i = 0; $i < scalar(@run); $i++){
353-
if($run[$i] eq "-i"){
354-
$target_fasta = $run[$i+1];
355-
}
356-
if($run[$i] eq "-l"){
357-
$lineage_path = $run[$i+1];
358-
if(substr($lineage_path,-1) eq "/"){
359-
$lineage_path = substr($lineage_path,0,-1);
357+
if($busco_major < 4){
358+
if($line =~ m/^# To reproduce this run: /){
359+
my @run = split(/ /,$line);
360+
for(my $i = 0; $i < scalar(@run); $i++){
361+
if($run[$i] eq "-i"){
362+
$target_fasta = $run[$i+1];
363+
}
364+
if($run[$i] eq "-l"){
365+
$lineage_path = $run[$i+1];
366+
if(substr($lineage_path,-1) eq "/"){
367+
$lineage_path = substr($lineage_path,0,-1);
368+
}
360369
}
361370
}
371+
next;
372+
}
373+
else{
374+
next;
362375
}
363-
next;
364376
}
365377
else{
366378
next;
@@ -412,7 +424,7 @@ sub check_rp{
412424
my $fa_frac = 0;
413425

414426
if($dry == 0){
415-
open (FA, '<', $target_fasta) or die "ERROR\tcould not open $target_fasta\n";
427+
open (FA, '<', $target_fasta) or die "ERROR\tCould not open $target_fasta\n";
416428

417429
my $head = "";
418430
my $out_file = "";
@@ -435,7 +447,7 @@ sub check_rp{
435447

436448
$out_file = $head . ".fa";
437449
$out_file =~ tr/[\/><|:&\\,;?*]/_/;
438-
open (OUT, '>', "$out_dir/$out_file") or die "ERROR\tcould not open $out_dir/$out_file\n";
450+
open (OUT, '>', "$out_dir/$out_file") or die "ERROR\tCould not open $out_dir/$out_file\n";
439451
push(@out_fa,"$out_dir/$out_file");
440452
}
441453
}
@@ -461,6 +473,11 @@ sub check_rp{
461473
exe_cmd($cmd,$verbose,$dry);
462474
}
463475

476+
if ($keep_tmp == 0){
477+
$cmd = "rm $out_dir/makeblastdb.log $out_dir/makeblastdb.err";
478+
exe_cmd($cmd,$verbose,$dry);
479+
}
480+
464481
my @cegma_gffs = ();
465482
my @buscos_keys = keys(%buscos);
466483

@@ -667,7 +684,18 @@ sub check_rp{
667684
print STDERR " " x $length . "\r";
668685

669686
if($keep_tmp == 0){
670-
my @db = ("ancestral_variants","ancestral_variants.pog","ancestral_variants.psd","ancestral_variants.psi","ancestral_variants.phr","ancestral_variants.psq","ancestral_variants.pin");
687+
my @db = ("ancestral_variants",
688+
"ancestral_variants.pog",
689+
"ancestral_variants.psd",
690+
"ancestral_variants.psi",
691+
"ancestral_variants.phr",
692+
"ancestral_variants.psq",
693+
"ancestral_variants.pin",
694+
"ancestral_variants.pto",
695+
"ancestral_variants.ptf",
696+
"ancestral_variants.pot",
697+
"ancestral_variants.pos",
698+
"ancestral_variants.pdb");
671699
foreach(@db){
672700
if(-f "$out_dir/$_"){
673701
$cmd = "rm $out_dir/$_";
@@ -710,6 +738,9 @@ sub check_rp{
710738
}
711739

712740
if(defined(can_run("fathom")) and defined(can_run("forge")) and defined(can_run("hmm-assembler.pl"))){
741+
my $snap_version = `fathom 2>&1 | grep "version" | sed 's/^.*(version //;s/)\$//'`;
742+
chomp $snap_version;
743+
print STDERR "INFO\tUsing snap $snap_version \n";
713744
print STDERR "INFO\tCreating snap model $out_dir/$prefix.snap.hmm\n";
714745
if($verbose == 1){
715746
print STDERR "CMD\tcd $out_dir\n";
@@ -724,9 +755,26 @@ sub check_rp{
724755
$cmd = "hmm-assembler.pl $prefix . > $prefix.snap.hmm 2> hmm-assembler.err";
725756
exe_cmd($cmd,$verbose,$dry);
726757
if($keep_tmp == 0){
758+
$cmd = "rm genome.dna genome.ann";
759+
exe_cmd($cmd,$verbose,$dry);
727760
my @patterns = ("*count","*model","*duration","transitions","phaseprefs","alt.ann","alt.dna","err.ann","err.dna","export.aa","export.ann","export.dna","export.tx","olp.ann","olp.dna","uni.ann","uni.dna","wrn.ann","wrn.dna");
728761
$cmd = "rm \$(find -mindepth 1 -maxdepth 1 -type f -name \"" . join("\" -or -name \"",@patterns) . "\")";
729762
exe_cmd($cmd,$verbose,$dry);
763+
my @logs = ("fathom.log",
764+
"fathom.err",
765+
"forge.log",
766+
"forge.err",
767+
"hmm-assembler.err");
768+
my @rm_logs = ();
769+
foreach(@logs){
770+
if (-z "$out_dir/$_"){
771+
push(@rm_logs,"$out_dir/$_");
772+
}
773+
}
774+
if (scalar(@rm_logs) > 0){
775+
$cmd = "rm " . join(" ",@rm_logs);
776+
exe_cmd($cmd,$verbose,$dry);
777+
}
730778
}
731779
}
732780

0 commit comments

Comments
 (0)