diff --git a/bin/merge_annotations_into_pkl.py b/bin/merge_annotations_into_pkl.py index 584a01b..ba4e144 100755 --- a/bin/merge_annotations_into_pkl.py +++ b/bin/merge_annotations_into_pkl.py @@ -98,10 +98,10 @@ def process_pickle(pkl_path: Path, annotations: Dict[str, Dict[str, Any]], outpu print(f"Annotated pickle saved to: {output_path}") -def process_input_folder(input_folder: Path, json_path: Path, output_folder: Path) -> None: +def process_input_folder(input_folder: Path, json_path: Path, output_folder: Path, infile_ext: str) -> None: annotations = load_annotations(json_path) - pickle_files = list(input_folder.glob("*.cds-only.pkl")) + pickle_files = list(input_folder.glob(f"*.cds-only.{infile_ext}")) if not pickle_files: print(f"No pickle files found in {input_folder}") @@ -112,7 +112,7 @@ def process_input_folder(input_folder: Path, json_path: Path, output_folder: Pat output_folder.mkdir(parents=True, exist_ok=True) for pkl_path in pickle_files: - new_name = pkl_path.name.replace(".cds-only.pkl", ".cds-annotated.pkl") + new_name = pkl_path.name.replace(f".cds-only.{infile_ext}", f".cds-annotated.{infile_ext}") output_path = output_folder / new_name process_pickle(pkl_path, annotations, output_path) @@ -122,12 +122,14 @@ def process_input_folder(input_folder: Path, json_path: Path, output_folder: Pat def main(): p = argparse.ArgumentParser(description="Annotate CDS pickles with information from JSON annotations") - p.add_argument("--pickle_folder", required=True, help="Folder containing .cds-only.pkl files") + p.add_argument("--pickle_folder", required=True, help="Folder containing .cds-only.`infile_ext` files") + p.add_argument("--infile_ext", action="store", help="Input file extension") p.add_argument("--annotations", required=True, help="bulk_protein_annotations.json file") p.add_argument("--out", default="annotated_pkl", help="Output directory") args = p.parse_args() input_folder = Path(args.pickle_folder) + infile_ext = str(args.infile_ext) json_path = Path(args.annotations) output_folder = Path(args.out) @@ -139,7 +141,7 @@ def main(): print(f"Annotations JSON file does not exist: {json_path}") return - process_input_folder(input_folder, json_path, output_folder) + process_input_folder(input_folder, json_path, output_folder, infile_ext) if __name__ == "__main__": diff --git a/main.nf b/main.nf index 566a4bd..29b5545 100755 --- a/main.nf +++ b/main.nf @@ -78,7 +78,7 @@ workflow { cds_pkl_list_ch = all_cds_outputs .flatten() - .filter { it.name.endsWith('.pkl') } + .filter { it.name.endsWith(".${params.serializer_ext}") } .collect() ch_cds_pkl = cds_pkl_list_ch.flatten() @@ -142,7 +142,7 @@ workflow { ch_rna_pkl = rna_outputs .flatten() - .filter { it.name.endsWith('.pkl') } + .filter { it.name.endsWith(".${params.serializer_ext}") } //----------------------------- // SORF extra search diff --git a/modules/detect_pseudogenes.nf b/modules/detect_pseudogenes.nf index ea9c9de..4d7988e 100644 --- a/modules/detect_pseudogenes.nf +++ b/modules/detect_pseudogenes.nf @@ -8,7 +8,7 @@ process DETECT_PSEUDOGENES { tuple path(manifest_file), path(bakta_db) output: - path("cds_with_pseudogenes/*with_pseudogenes.pkl") + path("cds_with_pseudogenes/*with_pseudogenes.${params.serializer_ext}") script: // output_prefix = assembly.getBaseName() diff --git a/modules/find_cds.nf b/modules/find_cds.nf index 0c28986..89d170b 100644 --- a/modules/find_cds.nf +++ b/modules/find_cds.nf @@ -8,7 +8,7 @@ process FIND_CDS { tuple path(assembly), path(bakta_db) output: - tuple path("CDSS_bakta/${output_prefix}.cds-only.faa"), path("CDSS_bakta/${output_prefix}.cds-only.pkl") + tuple path("CDSS_bakta/${output_prefix}.cds-only.faa"), path("CDSS_bakta/${output_prefix}.cds-only.${params.serializer_ext}") script: output_prefix = assembly.getBaseName() diff --git a/modules/find_rnas.nf b/modules/find_rnas.nf index 433d756..2201045 100644 --- a/modules/find_rnas.nf +++ b/modules/find_rnas.nf @@ -2,13 +2,14 @@ process FIND_RNAS { tag "rnas_search" label "rnas_search" label 'bakta' + scratch true // DEBUG publishDir params.outdir, enabled: params.save_intermediate, mode: 'copy' input: tuple path(assembly), path(bakta_db) output: - path("RNAS_bakta/${output_prefix}.rna-only.pkl") + path("RNAS_bakta/${output_prefix}.rna-only.${params.serializer_ext}") script: output_prefix = assembly.getBaseName() diff --git a/modules/merge_annotations.nf b/modules/merge_annotations.nf index d411574..c8848d4 100644 --- a/modules/merge_annotations.nf +++ b/modules/merge_annotations.nf @@ -9,12 +9,13 @@ process MERGE_ANNOTATIONS { path(bulk_annotations) output: - path("annotated_pkl/*.pkl"), emit: annotated_pickles + path("annotated_pkl/*.${params.serializer_ext}"), emit: annotated_pickles path("annotated_pkl"), emit: annotated_dir script: """ merge_annotations_into_pkl.py \ + --infile_ext ${params.serializer_ext} \ --pickle_folder . \ --annotations bulk_protein_annotations.json \ --out annotated_pkl diff --git a/nextflow.config b/nextflow.config index 5e760bc..4540581 100644 --- a/nextflow.config +++ b/nextflow.config @@ -30,7 +30,19 @@ params { outdir = "./pannotator_results" // Intermediate files - save_intermediate = false + save_intermediate = true + + // Serialization format for intermediate files + serializer = 'pickle' + + // Intermediate file extensions + // that should strictly match allowed options in SysBio's Bakta + // (i.e. don't change this parameter's value!) + // TODO: refactor to make more robust + serializer_ext = [ + pickle: 'pkl', + json: 'json' + ][serializer] // TODO: add debug flag that sets save_intermediate to `true` and turns on extra logging @@ -86,7 +98,7 @@ profiles { standard { process { withLabel:bakta { - container = 'quay.io/d_goryslavets/bakta_pannotator:1.12.0-pannotator.2' + container = 'quay.io/d_goryslavets/bakta_pannotator:1.12.0-pannotator.3' } withLabel:clustering {