Skip to content

Commit 5db906e

Browse files
authored
v0.7.0 (#101)
* Allow direct filter with awk * Add tests, fix tests * update tests * Add shellcheck fixes * Add shellcheck * fix coverage travis * fix travis * Report unchanged entries on update * Allow negation taxonomy NCBI * taxid negation gtdb, tests * change GTDB link * update readme * update readme * update README * update README * update README * update README
1 parent fabfbc0 commit 5db906e

File tree

10 files changed

+587
-401
lines changed

10 files changed

+587
-401
lines changed

.travis.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
language: bash
2-
dist: jammy
2+
dist: noble
33

44
before_install:
5-
- gem install bashcov codecov
65
- sudo apt-get install parallel
7-
- echo -e "require 'codecov'\nrequire 'simplecov'\nSimpleCov.formatter = Codecov::SimpleCov::Formatter" > .simplecov
6+
- gem install bashcov codecov
7+
- echo -e "SimpleCov.start do\n add_filter 'tests/'\nend" > .simplecov
88

99
script:
10-
- bashcov tests/libs/bats/bin/bats tests/integration_offline.bats
10+
- shellcheck genome_updater.sh
11+
- bashcov --skip-uncovered tests/libs/bats/bin/bats tests/integration_offline.bats
1112

1213
after_success:
1314
- curl -Os https://uploader.codecov.io/latest/linux/codecov
1415
- chmod +x codecov
15-
- ./codecov -f coverage/codecov-result.json -Z
16+
- ./codecov -f coverage/.resultset.json -Z
1617

1718
notifications:
1819
email: false

LICENSE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
The MIT License (MIT)
22

3-
Copyright (c) 2023 - Vitor C. Piro - http://github.com/pirovc
3+
Copyright (c) 2025 - Vitor C. Piro - http://github.com/pirovc
44
All rights reserved.
55

66
Permission is hereby granted, free of charge, to any person obtaining a copy

README.md

Lines changed: 260 additions & 154 deletions
Large diffs are not rendered by default.

genome_updater.sh

Lines changed: 254 additions & 205 deletions
Large diffs are not rendered by default.

pre-commit.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/usr/bin/env bash
2+
3+
if shellcheck genome_updater.sh
4+
then
5+
echo "shellcheck found no issues!"
6+
fi
7+
8+
echo -e "SimpleCov.start do\n add_filter 'tests/'\nend" > .simplecov
9+
bashcov --skip-uncovered tests/libs/bats/bin/bats tests/integration_offline.bats

tests/integration_offline.bats

Lines changed: 53 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -131,21 +131,17 @@ setup_file() {
131131

132132
outdir=${outprefix}taxids-leaves-ncbi/
133133
label="test"
134-
# Get all possible taxids from base assembly_summary
135-
txids=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 7 ) )
136-
#echo ${txids[@]} >&3
137-
138-
# Use third
139-
run ./genome_updater.sh -d refseq -T "${txids[2]}" -b ${label} -o ${outdir}
134+
135+
# Include only archaea taxids, remove fungi, based on taxid filter
136+
run ./genome_updater.sh -d refseq -g archaea,fungi -T '^4751,2157' -b ${label} -o ${outdir}
140137
sanity_check ${outdir} ${label}
141138

142-
# Check if output contains only used taxids
143-
txids_ret=( $(get_values_as ${outdir}assembly_summary.txt 7 ) )
144-
#echo ${txids_ret[@]} >&3
139+
# Check if output contains only archaea accessions
140+
readarray -t archaea_acc < <(get_values_as ${local_dir}genomes/refseq/archaea/assembly_summary.txt 1)
141+
readarray -t ret_acc < <(get_values_as ${outdir}assembly_summary.txt 1)
145142

146-
# Used taxid should be the only one
147-
assert_equal ${#txids_ret[@]} 1 #length
148-
assert_equal ${txids[2]} ${txids_ret[0]} #same taxid
143+
# Check if output is as expected (only archaea)
144+
assert_equal $(echo ${archaea_acc[@]} ${ret_acc[@]} | tr ' ' '\n' | sort | uniq -d | wc -l) $(count_lines_file "${local_dir}genomes/refseq/archaea/assembly_summary.txt")
149145
}
150146

151147
@test "Taxids leaves gtdb" {
@@ -154,24 +150,24 @@ setup_file() {
154150
outdir=${outprefix}taxids-leaves-gtdb/
155151
label="test"
156152
# Use fixed one
157-
run ./genome_updater.sh -d refseq,genbank -T 's__MWBV01 sp002069705' -b ${label} -o ${outdir} -g archaea -M gtdb
153+
run ./genome_updater.sh -d refseq,genbank -T 'o__Halobacteriales,^f__Haloferacaceae' -b ${label} -o ${outdir} -g archaea -M gtdb
158154
sanity_check ${outdir} ${label}
159-
assert [ $(count_files ${outdir} ${label}) -eq 1 ]
155+
assert [ $(count_files ${outdir} ${label}) -eq 7 ]
160156
}
161157

162158
@test "Refseq category" {
163159
outdir=${outprefix}refseq-category/
164160
label="test"
165161
# Get all possible refseq category values from base assembly_summary
166-
rscat=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 5 ) )
162+
readarray -t rscat < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 5)
167163
#echo ${rscat[@]} >&3
168164

169165
# Use first
170166
run ./genome_updater.sh -d refseq -c "${rscat[0]}" -b ${label} -o ${outdir}
171167
sanity_check ${outdir} ${label}
172168

173169
# Check if output contains only selected refseq category
174-
rscat_ret=( $(get_values_as ${outdir}assembly_summary.txt 5 ) )
170+
readarray -t rscat_ret < <(get_values_as ${outdir}assembly_summary.txt 5)
175171
#echo ${rscat_ret[@]} >&3
176172

177173
# Should just return same refseq category
@@ -184,15 +180,15 @@ setup_file() {
184180
outdir=${outprefix}assembly-level/
185181
label="test"
186182
# Get all possible assembly level values from base assembly_summary
187-
aslev=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 12 ) )
183+
readarray -t aslev < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 12)
188184
#echo ${aslev[@]} >&3
189185

190186
# Use first
191187
run ./genome_updater.sh -d refseq -l "${aslev[0]}" -b ${label} -o ${outdir}
192188
sanity_check ${outdir} ${label}
193189

194190
# Check if output contains only selected assembly level
195-
aslev_ret=( $(get_values_as ${outdir}assembly_summary.txt 12 ) )
191+
readarray -t aslev_ret < <(get_values_as ${outdir}assembly_summary.txt 12)
196192
#echo ${aslev_ret[@]} >&3
197193

198194
# Should just return same assembly level
@@ -206,28 +202,53 @@ setup_file() {
206202
label="test"
207203

208204
# Get all possible refseq category and assembly level values from base assembly_summary
209-
rscat=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 5 ) )
210-
aslev=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 12 ) )
205+
readarray -t rscat < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 5)
206+
readarray -t aslev < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 12)
211207

212208
# Simulate refseq category and assembly level filter using the custom filter
213-
run ./genome_updater.sh -d refseq -F "5:${rscat[0]}|12:${aslev[0]}" -b ${label} -o ${outdir}
209+
run ./genome_updater.sh -d refseq -F "\$5 == \"${rscat[0]}\" && \$12 == \"${aslev[0]}\"" -b ${label} -o ${outdir}
214210
sanity_check ${outdir} ${label}
215211

216212
# Check if output contains only selected refseq category
217-
rscat_ret=( $(get_values_as ${outdir}assembly_summary.txt 5 ) )
213+
readarray -t rscat_ret < <(get_values_as ${outdir}assembly_summary.txt 5)
218214
# Should just return same refseq category
219215
for rsc in ${rscat_ret[@]}; do
220216
assert_equal ${rsc} ${rscat[0]}
221217
done
222218

223219
# Check if output contains only selected assembly level
224-
aslev_ret=( $(get_values_as ${outdir}assembly_summary.txt 12 ) )
220+
readarray -t aslev_ret < <(get_values_as ${outdir}assembly_summary.txt 12)
225221
# Should just return same assembly level
226222
for asl in ${aslev_ret[@]}; do
227223
assert_equal ${asl} ${aslev[0]}
228224
done
229225
}
230226

227+
@test "Custom filter regex" {
228+
outdir=${outprefix}custom-filter-regex/
229+
label="test"
230+
pattern="bacterium"
231+
232+
# Get all column 8 values (organism names) from base assembly_summary
233+
readarray -t ogname < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 8)
234+
ogname_matches=$(printf '%s\n' "${ogname[@]}" | grep "${pattern}" | wc -l)
235+
236+
# Return only entries matching pattern
237+
# Keep ${pattern} inside the double-quoted awk expression so it is not word-split or globbed
run ./genome_updater.sh -d refseq -F "\$8 ~ /${pattern}/" -b ${label} -o ${outdir}
238+
sanity_check ${outdir} ${label}
239+
240+
# Check if all of matching patterns were returned
241+
assert [ $(count_files ${outdir} ${label}) -eq ${ogname_matches} ]
242+
243+
# Check if output contains matching pattern
244+
readarray -t ogname_ret < <(get_values_as ${outdir}assembly_summary.txt 8)
245+
for ogn in "${ogname_ret[@]}"; do
246+
# Pass grep itself to assert: `grep -q` prints nothing, so wrapping it in $(...) would hand
# assert an empty argument list and the match result would never be checked.
assert grep -q "${pattern}" <<< "${ogn}"
247+
done
248+
249+
250+
}
251+
231252
@test "Top 1 leaves ncbi" {
232253
outdir=${outprefix}top-leaves-ncbi/
233254
label="test"
@@ -236,7 +257,7 @@ setup_file() {
236257
sanity_check ${outdir} ${label}
237258

238259
# Get counts of species taxids on output
239-
txids_ret=$(get_values_as ${outdir}assembly_summary.txt 6 )
260+
readarray -t txids_ret < <(get_values_as ${outdir}assembly_summary.txt 6)
240261
# txids_ret is an array (filled by readarray); expand every element, not just index 0,
# then count how often each taxid occurs.
ret_occ=( $( echo "${txids_ret[@]}" | tr ' ' '\n' | sort | uniq -c | awk '{print $1}' ) )
241262

242263
# Should have one assembly for each species taxid
@@ -253,7 +274,7 @@ setup_file() {
253274
sanity_check ${outdir} ${label}
254275

255276
# Get counts of species taxids on output
256-
txids_ret=$(get_values_as ${outdir}assembly_summary.txt 7 )
277+
readarray -t txids_ret < <(get_values_as ${outdir}assembly_summary.txt 7)
257278
# txids_ret is an array (filled by readarray); expand every element, not just index 0,
# then count how often each taxid occurs.
ret_occ=( $( echo "${txids_ret[@]}" | tr ' ' '\n' | sort | uniq -c | awk '{print $1}' ) )
258279

259280
# Should have one assembly for each species taxid
@@ -335,25 +356,25 @@ setup_file() {
335356
# should always pick the correct assembly level for top superkingdom (just one)
336357

337358
label="4"
338-
aslvl="complete genome,chromosome,scaffold,contig"
359+
aslvl="Complete Genome,Chromosome,Scaffold,Contig"
339360
run ./genome_updater.sh -d refseq -g archaea -l "${aslvl}" -A superkingdom:1 -b ${label} -o ${outdir}
340361
sanity_check ${outdir} ${label}
341362
assert_equal "Complete Genome" "$(get_values_as ${outdir}assembly_summary.txt 12)"
342363

343364
label="3"
344-
aslvl="chromosome,scaffold,contig"
365+
aslvl="Chromosome,Scaffold,Contig"
345366
run ./genome_updater.sh -d refseq -g archaea -l "${aslvl}" -A superkingdom:1 -b ${label} -o ${outdir}
346367
sanity_check ${outdir} ${label}
347368
assert_equal "Chromosome" "$(get_values_as ${outdir}assembly_summary.txt 12)"
348369

349370
label="2"
350-
aslvl="scaffold,contig"
371+
aslvl="Scaffold,Contig"
351372
run ./genome_updater.sh -d refseq -g archaea -l "${aslvl}" -A superkingdom:1 -b ${label} -o ${outdir}
352373
sanity_check ${outdir} ${label}
353374
assert_equal "Scaffold" "$(get_values_as ${outdir}assembly_summary.txt 12)"
354375

355376
label="1"
356-
aslvl="contig"
377+
aslvl="Contig"
357378
run ./genome_updater.sh -d refseq -g archaea -l "${aslvl}" -A superkingdom:1 -b ${label} -o ${outdir}
358379
sanity_check ${outdir} ${label}
359380
assert_equal "Contig" "$(get_values_as ${outdir}assembly_summary.txt 12)"
@@ -365,7 +386,7 @@ setup_file() {
365386
outdir=${outprefix}date-start-filter/
366387

367388
# Get all possible dates and sort it
368-
dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) )
389+
readarray -t dates < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort)
369390

370391
label="test_all"
371392
# Use first date as start, should return everything
@@ -384,7 +405,7 @@ setup_file() {
384405
outdir=${outprefix}date-end-filter/
385406

386407
# Get all possible dates and sort it
387-
dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) )
408+
readarray -t dates < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort)
388409

389410
label="test_all"
390411
# Use last date as end, should return everything
@@ -403,7 +424,7 @@ setup_file() {
403424
outdir=${outprefix}date-start-end-filter/
404425

405426
# Get all possible dates and sort it
406-
dates=( $(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort) )
427+
readarray -t dates < <(get_values_as ${local_dir}genomes/refseq/assembly_summary_refseq.txt 15 | sed 's|/||g' | sort)
407428

408429
label="test_all"
409430
# Use first date as start, last as end, should return everything

tests/integration_online.bats

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ setup_file() {
167167
sanity_check ${outdir} ${label}
168168

169169
# Get counts of taxids on output
170-
txids_ret=$(get_values_as ${outdir}assembly_summary.txt 6 )
170+
readarray -t txids_ret < <(get_values_as ${outdir}assembly_summary.txt 6)
171171
# txids_ret is an array (filled by readarray); expand every element, not just index 0,
# then count how often each taxid occurs.
ret_occ=( $( echo "${txids_ret[@]}" | tr ' ' '\n' | sort | uniq -c | awk '{print $1}' ) )
172172

173173
# Should have one assembly for each species taxid

tests/libs/bats

Submodule bats updated 120 files

0 commit comments

Comments
 (0)