@@ -131,21 +131,17 @@ setup_file() {
131131
132132 outdir=${outprefix} taxids-leaves-ncbi/
133133 label=" test"
134- # Get all possible taxids from base assembly_summary
135- txids=( $( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 7 ) )
136- # echo ${txids[@]} >&3
137-
138- # Use third
139- run ./genome_updater.sh -d refseq -T " ${txids[2]} " -b ${label} -o ${outdir}
134+
135+ # Include only archaea taxids, remove fungi, based on taxid filter
136+ run ./genome_updater.sh -d refseq -g archaea,fungi -T ' ^4751,2157' -b ${label} -o ${outdir}
140137 sanity_check ${outdir} ${label}
141138
142- # Check if output contains only used taxids
143- txids_ret=( $ ( get_values_as ${outdir} assembly_summary.txt 7 ) )
144- # echo ${txids_ret[@]} >&3
139+ # Check if output only archaea accessions
140+ readarray -t archaea_acc < < ( get_values_as ${local_dir} genomes/refseq/archaea/ assembly_summary.txt 1 )
141+ readarray -t ret_acc < <( get_values_as ${outdir} assembly_summary.txt 1 )
145142
146- # Used taxid should be the only one
147- assert_equal ${# txids_ret[@]} 1 # length
148- assert_equal ${txids[2]} ${txids_ret[0]} # same taxid
143+ # Check if output is as expected (only archaea)
144+ assert_equal $( echo ${archaea_acc[@]} ${ret_acc[@]} | tr ' ' ' \n' | sort | uniq -d | wc -l) $( count_lines_file " ${local_dir} genomes/refseq/archaea/assembly_summary.txt" )
149145}
150146
151147@test " Taxids leaves gtdb" {
@@ -154,24 +150,24 @@ setup_file() {
154150 outdir=${outprefix} taxids-leaves-gtdb/
155151 label=" test"
156152 # Use fixed one
157- run ./genome_updater.sh -d refseq,genbank -T ' s__MWBV01 sp002069705 ' -b ${label} -o ${outdir} -g archaea -M gtdb
153+ run ./genome_updater.sh -d refseq,genbank -T ' o__Halobacteriales,^f__Haloferacaceae ' -b ${label} -o ${outdir} -g archaea -M gtdb
158154 sanity_check ${outdir} ${label}
159- assert [ $( count_files ${outdir} ${label} ) -eq 1 ]
155+ assert [ $( count_files ${outdir} ${label} ) -eq 7 ]
160156}
161157
162158@test " Refseq category" {
163159 outdir=${outprefix} refseq-category/
164160 label=" test"
165161 # Get all possible refseq category values from base assembly_summary
166- rscat=( $ ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 5 ) )
162+ readarray -t rscat < < ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 5)
167163 # echo ${rscat[@]} >&3
168164
169165 # Use first
170166 run ./genome_updater.sh -d refseq -c " ${rscat[0]} " -b ${label} -o ${outdir}
171167 sanity_check ${outdir} ${label}
172168
173169 # Check if output contains only selected refseq category
174- rscat_ret=( $ ( get_values_as ${outdir} assembly_summary.txt 5 ) )
170+ readarray -t rscat_ret < < ( get_values_as ${outdir} assembly_summary.txt 5)
175171 # echo ${rscat_ret[@]} >&3
176172
177173 # Should just return same refseq category
@@ -184,15 +180,15 @@ setup_file() {
184180 outdir=${outprefix} assembly-level/
185181 label=" test"
186182 # Get all possible assembly level values from base assembly_summary
187- aslev=( $ ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 12 ) )
183+ readarray -t aslev < < ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 12)
188184 # echo ${aslev[@]} >&3
189185
190186 # Use first
191187 run ./genome_updater.sh -d refseq -l " ${aslev[0]} " -b ${label} -o ${outdir}
192188 sanity_check ${outdir} ${label}
193189
194190 # Check if output contains only selected assembly level
195- aslev_ret=( $ ( get_values_as ${outdir} assembly_summary.txt 12 ) )
191+ readarray -t aslev_ret < < ( get_values_as ${outdir} assembly_summary.txt 12)
196192 # echo ${aslev_ret[@]} >&3
197193
198194 # Should just return same assembly level
@@ -206,28 +202,53 @@ setup_file() {
206202 label=" test"
207203
208204 # Get all possible assembly level values from base assembly_summary
209- rscat=( $ ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 5 ) )
210- aslev=( $ ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 12 ) )
205+ readarray -t rscat < < ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 5)
206+ readarray -t aslev < < ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 12)
211207
212208 # Simulate refseq category and assembly level filter using the custom filter
213- run ./genome_updater.sh -d refseq -F " 5: ${rscat[0]} |12: ${aslev[0]} " -b ${label} -o ${outdir}
209+ run ./genome_updater.sh -d refseq -F " \$ 5 == \" ${rscat[0]} \" && \$ 12 == \" ${aslev[0]} \" " -b ${label} -o ${outdir}
214210 sanity_check ${outdir} ${label}
215211
216212 # Check if output contains only selected refseq category
217- rscat_ret=( $ ( get_values_as ${outdir} assembly_summary.txt 5 ) )
213+ readarray -t rscat_ret < < ( get_values_as ${outdir} assembly_summary.txt 5)
218214 # Should just return same refseq category
219215 for rsc in ${rscat_ret[@]} ; do
220216 assert_equal ${rsc} ${rscat[0]}
221217 done
222218
223219 # Check if output contains only selected assembly level
224- aslev_ret=( $ ( get_values_as ${outdir} assembly_summary.txt 12 ) )
220+ readarray -t aslev_ret < < ( get_values_as ${outdir} assembly_summary.txt 12)
225221 # Should just return same assembly level
226222 for asl in ${aslev_ret[@]} ; do
227223 assert_equal ${asl} ${aslev[0]}
228224 done
229225}
230226
227+ @test " Custom filter regex" {
228+ outdir=${outprefix} custom-filter-regex/
229+ label=" test"
230+ pattern=" bacterium"
231+
232+ # Get all possible assembly level values from base assembly_summary
233+ readarray -t ogname < <( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 8)
234+ ogname_matches=$( printf ' %s\n' " ${ogname[@]} " | grep " ${pattern} " | wc -l)
235+
236+ # Return only entries matching pattern
237+ run ./genome_updater.sh -d refseq -F " \$ 8 ~ /" ${pattern} " /" -b ${label} -o ${outdir}
238+ sanity_check ${outdir} ${label}
239+
240+ # Check if all of matching patterns were returned
241+ assert [ $( count_files ${outdir} ${label} ) -eq ${ogname_matches} ]
242+
243+ # Check if output contains matching pattern
244+ readarray -t ogname_ret < <( get_values_as ${outdir} assembly_summary.txt 8)
245+ for ogn in " ${ogname_ret[@]} " ; do
246+ assert $( grep -q " ${pattern} " <<< $ogn )
247+ done
248+
249+
250+ }
251+
231252@test " Top 1 leaves ncbi" {
232253 outdir=${outprefix} top-leaves-ncbi/
233254 label=" test"
@@ -236,7 +257,7 @@ setup_file() {
236257 sanity_check ${outdir} ${label}
237258
238259 # Get counts of species taxids on output
239- txids_ret= $ ( get_values_as ${outdir} assembly_summary.txt 6 )
260+ readarray -t txids_ret < < ( get_values_as ${outdir} assembly_summary.txt 6)
240261 ret_occ=( $( echo ${txids_ret} | tr ' ' ' \n' | sort | uniq -c | awk ' {print $1}' ) )
241262
242263 # Should have one assembly for each species taxid
@@ -253,7 +274,7 @@ setup_file() {
253274 sanity_check ${outdir} ${label}
254275
255276 # Get counts of species taxids on output
256- txids_ret= $ ( get_values_as ${outdir} assembly_summary.txt 7 )
277+ readarray -t txids_ret < < ( get_values_as ${outdir} assembly_summary.txt 7)
257278 ret_occ=( $( echo ${txids_ret} | tr ' ' ' \n' | sort | uniq -c | awk ' {print $1}' ) )
258279
259280 # Should have one assembly for each species taxid
@@ -335,25 +356,25 @@ setup_file() {
335356 # should always pick the correct assembly level for top superkingdom (just one)
336357
337358 label=" 4"
338- aslvl=" complete genome,chromosome,scaffold,contig "
359+ aslvl=" Complete Genome,Chromosome,Scaffold,Contig "
339360 run ./genome_updater.sh -d refseq -g archaea -l " ${aslvl} " -A superkingdom:1 -b ${label} -o ${outdir}
340361 sanity_check ${outdir} ${label}
341362 assert_equal " Complete Genome" " $( get_values_as ${outdir} assembly_summary.txt 12) "
342363
343364 label=" 3"
344- aslvl=" chromosome,scaffold,contig "
365+ aslvl=" Chromosome,Scaffold,Contig "
345366 run ./genome_updater.sh -d refseq -g archaea -l " ${aslvl} " -A superkingdom:1 -b ${label} -o ${outdir}
346367 sanity_check ${outdir} ${label}
347368 assert_equal " Chromosome" " $( get_values_as ${outdir} assembly_summary.txt 12) "
348369
349370 label=" 2"
350- aslvl=" scaffold,contig "
371+ aslvl=" Scaffold,Contig "
351372 run ./genome_updater.sh -d refseq -g archaea -l " ${aslvl} " -A superkingdom:1 -b ${label} -o ${outdir}
352373 sanity_check ${outdir} ${label}
353374 assert_equal " Scaffold" " $( get_values_as ${outdir} assembly_summary.txt 12) "
354375
355376 label=" 1"
356- aslvl=" contig "
377+ aslvl=" Contig "
357378 run ./genome_updater.sh -d refseq -g archaea -l " ${aslvl} " -A superkingdom:1 -b ${label} -o ${outdir}
358379 sanity_check ${outdir} ${label}
359380 assert_equal " Contig" " $( get_values_as ${outdir} assembly_summary.txt 12) "
@@ -365,7 +386,7 @@ setup_file() {
365386 outdir=${outprefix} date-start-filter/
366387
367388 # Get all possible dates and sort it
368- dates=( $ ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 15 | sed ' s|/||g' | sort) )
389+ readarray -t dates < < ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 15 | sed ' s|/||g' | sort)
369390
370391 label=" test_all"
371392 # Use first date as start, should return everything
@@ -384,7 +405,7 @@ setup_file() {
384405 outdir=${outprefix} date-end-filter/
385406
386407 # Get all possible dates and sort it
387- dates=( $ ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 15 | sed ' s|/||g' | sort) )
408+ readarray -t dates < < ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 15 | sed ' s|/||g' | sort)
388409
389410 label=" test_all"
390411 # Use last date as end, should return everything
@@ -403,7 +424,7 @@ setup_file() {
403424 outdir=${outprefix} date-start-end-filter/
404425
405426 # Get all possible dates and sort it
406- dates=( $ ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 15 | sed ' s|/||g' | sort) )
427+ readarray -t dates < < ( get_values_as ${local_dir} genomes/refseq/assembly_summary_refseq.txt 15 | sed ' s|/||g' | sort)
407428
408429 label=" test_all"
409430 # Use first date as start, last as end, should return everything
0 commit comments