Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions bin/merge_annotations_into_pkl.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def process_pickle(pkl_path: Path, annotations: Dict[str, Dict[str, Any]], outpu
print(f"Annotated pickle saved to: {output_path}")


def process_input_folder(input_folder: Path, json_path: Path, output_folder: Path) -> None:
def process_input_folder(input_folder: Path, json_path: Path, output_folder: Path, infile_ext: str) -> None:
annotations = load_annotations(json_path)

pickle_files = list(input_folder.glob("*.cds-only.pkl"))
pickle_files = list(input_folder.glob(f"*.cds-only.{infile_ext}"))

if not pickle_files:
print(f"No pickle files found in {input_folder}")
Expand All @@ -112,7 +112,7 @@ def process_input_folder(input_folder: Path, json_path: Path, output_folder: Pat
output_folder.mkdir(parents=True, exist_ok=True)

for pkl_path in pickle_files:
new_name = pkl_path.name.replace(".cds-only.pkl", ".cds-annotated.pkl")
new_name = pkl_path.name.replace(f".cds-only.{infile_ext}", f".cds-annotated.{infile_ext}")
output_path = output_folder / new_name

process_pickle(pkl_path, annotations, output_path)
Expand All @@ -122,12 +122,14 @@ def process_input_folder(input_folder: Path, json_path: Path, output_folder: Pat

def main():
p = argparse.ArgumentParser(description="Annotate CDS pickles with information from JSON annotations")
p.add_argument("--pickle_folder", required=True, help="Folder containing .cds-only.pkl files")
p.add_argument("--pickle_folder", required=True, help="Folder containing .cds-only.`infile_ext` files")
p.add_argument("--infile_ext", action="store", help="Input file extension")
p.add_argument("--annotations", required=True, help="bulk_protein_annotations.json file")
p.add_argument("--out", default="annotated_pkl", help="Output directory")
args = p.parse_args()

input_folder = Path(args.pickle_folder)
infile_ext = str(args.infile_ext)
json_path = Path(args.annotations)
output_folder = Path(args.out)

Expand All @@ -139,7 +141,7 @@ def main():
print(f"Annotations JSON file does not exist: {json_path}")
return

process_input_folder(input_folder, json_path, output_folder)
process_input_folder(input_folder, json_path, output_folder, infile_ext)


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ workflow {

cds_pkl_list_ch = all_cds_outputs
.flatten()
.filter { it.name.endsWith('.pkl') }
.filter { it.name.endsWith(".${params.serializer_ext}") }
.collect()

ch_cds_pkl = cds_pkl_list_ch.flatten()
Expand Down Expand Up @@ -142,7 +142,7 @@ workflow {

ch_rna_pkl = rna_outputs
.flatten()
.filter { it.name.endsWith('.pkl') }
.filter { it.name.endsWith(".${params.serializer_ext}") }

//-----------------------------
// SORF extra search
Expand Down
2 changes: 1 addition & 1 deletion modules/detect_pseudogenes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ process DETECT_PSEUDOGENES {
tuple path(manifest_file), path(bakta_db)

output:
path("cds_with_pseudogenes/*with_pseudogenes.pkl")
path("cds_with_pseudogenes/*with_pseudogenes.${params.serializer_ext}")

script:
// output_prefix = assembly.getBaseName()
Expand Down
2 changes: 1 addition & 1 deletion modules/find_cds.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ process FIND_CDS {
tuple path(assembly), path(bakta_db)

output:
tuple path("CDSS_bakta/${output_prefix}.cds-only.faa"), path("CDSS_bakta/${output_prefix}.cds-only.pkl")
tuple path("CDSS_bakta/${output_prefix}.cds-only.faa"), path("CDSS_bakta/${output_prefix}.cds-only.${params.serializer_ext}")

script:
output_prefix = assembly.getBaseName()
Expand Down
3 changes: 2 additions & 1 deletion modules/find_rnas.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ process FIND_RNAS {
tag "rnas_search"
label "rnas_search"
label 'bakta'
scratch true // DEBUG
publishDir params.outdir, enabled: params.save_intermediate, mode: 'copy'

input:
tuple path(assembly), path(bakta_db)

output:
path("RNAS_bakta/${output_prefix}.rna-only.pkl")
path("RNAS_bakta/${output_prefix}.rna-only.${params.serializer_ext}")

script:
output_prefix = assembly.getBaseName()
Expand Down
3 changes: 2 additions & 1 deletion modules/merge_annotations.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ process MERGE_ANNOTATIONS {
path(bulk_annotations)

output:
path("annotated_pkl/*.pkl"), emit: annotated_pickles
path("annotated_pkl/*.${params.serializer_ext}"), emit: annotated_pickles
path("annotated_pkl"), emit: annotated_dir

script:
"""
merge_annotations_into_pkl.py \
--infile_ext ${params.serializer_ext} \
--pickle_folder . \
--annotations bulk_protein_annotations.json \
--out annotated_pkl
Expand Down
16 changes: 14 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,19 @@ params {
outdir = "./pannotator_results"

// Intermediate files
save_intermediate = false
save_intermediate = true

// Serialization format for intermediate files
serializer = 'pickle'

// File extension of intermediate files, looked up from `serializer` in the map below.
// The extensions must strictly match the options allowed by SysBio's Bakta
// (i.e. don't change this parameter's value!)
// TODO: refactor to make more robust
serializer_ext = [
pickle: 'pkl',
json: 'json'
][serializer]

// TODO: add debug flag that sets save_intermediate to `true` and turns on extra logging

Expand Down Expand Up @@ -86,7 +98,7 @@ profiles {
standard {
process {
withLabel:bakta {
container = 'quay.io/d_goryslavets/bakta_pannotator:1.12.0-pannotator.2'
container = 'quay.io/d_goryslavets/bakta_pannotator:1.12.0-pannotator.3'

}
withLabel:clustering {
Expand Down