From 0dd9ce868d0e9e954f3489354e6388b1d227ca08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Thu, 9 Oct 2025 16:25:00 +0200 Subject: [PATCH 1/9] Add long reads workflow for host or contamination removal --- .../.dockstore.yml | 13 + .../CHANGELOG.md | 5 + .../README.md | 21 + ...tamination-removal-on-long-reads-tests.yml | 183 +++++++ ...-or-contamination-removal-on-long-reads.ga | 454 ++++++++++++++++++ .../plnmotmptestjob4912th98.json | 1 + 6 files changed, 677 insertions(+) create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/.dockstore.yml create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/.dockstore.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/.dockstore.yml new file mode 100644 index 0000000000..802ff1c1de --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/.dockstore.yml @@ -0,0 +1,13 @@ +version: 1.2 +workflows: +- name: main + subclass: Galaxy + publish: true + primaryDescriptorPath: /host-or-contamination-removal-on-long-reads.ga + testParameterFiles: + - /host-or-contamination-removal-on-long-reads-tests.yml + authors: + - name: Paul Zierep + orcid: 
0000-0003-2982-388X + - name: "B\xE9r\xE9nice Batut" + orcid: 0000-0001-9852-1987 diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md new file mode 100644 index 0000000000..2bef198049 --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md @@ -0,0 +1,5 @@ +# Changelog + +## [0.1] yyyy-mm-dd + +First release. diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md new file mode 100644 index 0000000000..45aeb3ec82 --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md @@ -0,0 +1,21 @@ +# Host or Contamination removal on long-reads + +The extraction of microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also other contaminant). It is important to get rid of all host/contamination sequences and to only retain microbiome sequences, both in order to speed up further steps and to avoid host/contamination sequences compromising the analysis. + +This workflow takes Nanopore fastq(.gz) files and executes the following steps: +1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Minimap 2**, +2. Filtering of the generated BAM using **BAMtools** and **Samtools** to keep only the reads that do not align, +3. Generation of mapping statistics using **QualiMap**, +4. Aggregation of the mapping statistics using **MultiQC** + +## Input Datasets + +- A list of datasets corresponding to reads in `fastqsanger` or `fastqsanger.gz` format. 
+- Reference genome +- Profile for mapping + +## Output Datasets + +- A list of datasets corresponding to unmapped reads in `fastqsanger` or `fastqsanger.gz`. +- A list of reports of QualiMap for each sample that could be used as inputs for extra MultiQC +- MultiQC report of the mapping statistics in HTML \ No newline at end of file diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml new file mode 100644 index 0000000000..05c71ec7d6 --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml @@ -0,0 +1,183 @@ +- doc: Test outline for host-or-contamination-removal-on-long-reads + job: + Long-reads: + class: Collection + collection_type: list + elements: + - class: File + identifier: Spike3bBarcode10 + location: https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode10.fastq.gz + filetype: fastqsanger.gz + - class: File + identifier: Spike3bBarcode12 + location: https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode12.fastq.gz + filetype: fastqsanger.gz + Host/Contaminant Reference Genome (long-reads): hg38 + Profile of preset options for the mapping (long-read): map-pb + outputs: + qualimap_stats: + element_tests: + Spike3bBarcode10: + elements: + genome_results: + asserts: + - has_text: + text: "Spike3bBarcode10" + - has_text: + text: "3,209,286,105 bp" + coverage_across_reference: + asserts: + has_text: + text: "#Position (bp)" + has_n_lines: + value: 854 + coverage_histogram: + asserts: + has_text: + text: "Number of genomic locations" + has_n_lines: + value: 7 + genome_fraction_coverage: + asserts: + has_text: + text: "#Coverage (X)" + has_n_lines: + value: 151 
+ duplication_rate_histogram: + asserts: + - has_text: + text: "#Duplication rate" + - has_text: + text: "104.0" + homopolymer_indels: + asserts: + - has_text: + text: "#Type of indel" + - has_text: + text: "polyN" + insert_size_across_reference: + asserts: + has_size: + value: 0 + insert_size_histogram: + asserts: + has_size: + value: 0 + mapped_reads_clipping_profile: + asserts: + - has_text: + text: "#Read position (bp)" + - has_text: + text: "6.161988" + mapped_reads_gc-content_distribution: + asserts: + has_text: + text: "#GC Content (%)" + has_n_lines: + value: 100 + mapped_reads_nucleotide_content: + asserts: + has_text: + text: "16.666666" + mapping_quality_across_reference: + asserts: + has_text: + text: "Filtered Reads" + has_n_lines: + value: 854 + mapping_quality_histogram: + asserts: + has_text: + text: "#Mapping quality" + has_n_lines: + value: 41 + Spike3bBarcode12: + elements: + genome_results: + asserts: + - has_text: + text: "Spike3bBarcode12" + - has_text: + text: "3,209,286,105 bp" + coverage_across_reference: + asserts: + has_text: + text: "#Position (bp)" + has_n_lines: + value: 100 + coverage_histogram: + asserts: + has_text: + text: "Number of genomic locations" + has_n_lines: + value: 4 + genome_fraction_coverage: + asserts: + has_text: + text: "#Coverage (X)" + has_n_lines: + value: 51 + duplication_rate_histogram: + asserts: + - has_text: + text: "#Duplication rate" + - has_text: + text: "119.0" + homopolymer_indels: + asserts: + - has_text: + text: "#Type of indel" + - has_text: + text: "polyN" + insert_size_across_reference: + asserts: + has_size: + value: 0 + insert_size_histogram: + asserts: + has_size: + value: 0 + mapped_reads_clipping_profile: + asserts: + - has_text: + text: "#Read position (bp)" + - has_text: + text: "2.273913" + mapped_reads_gc-content_distribution: + asserts: + has_text: + text: "#GC Content (%)" + has_n_lines: + value: 100 + mapped_reads_nucleotide_content: + asserts: + has_text: + text: "16.666666" + mapping_quality_across_reference: 
+ asserts: + has_text: + text: "Filtered Reads" + has_n_lines: + value: 854 + mapping_quality_histogram: + asserts: + has_text: + text: "#Mapping quality" + has_n_lines: + value: 37 + multiqc_html_report: + asserts: + - has_text: + text: "Spike3bBarcode10" + - has_text: + text: "Spike3bBarcode12" + samtools_fastx: + element_tests: + Spike3bBarcode10: + asserts: + has_text: + text: "@0a0c4d2c-291f-46a4-87d5-625efbfed6a0" + Spike3bBarcode12: + asserts: + has_text: + text: "@0a0c4e88-893a-4284-9119-ab4274e05445" \ No newline at end of file diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga new file mode 100644 index 0000000000..3a960d9ffb --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga @@ -0,0 +1,454 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "This workflow takes Nanopore fastq(.gz) files and runs Minimap2 to map the reads against a reference genome (human, by default). It filters the output to keep only the unmapped reads and generates mapping statistics that are aggregated into a MultiQC report.", + "comments": [], + "creator": [ + { + "class": "Person", + "identifier": "https://orcid.org/0000-0003-2982-388X", + "name": "Paul Zierep" + }, + { + "class": "Person", + "identifier": "https://orcid.org/0000-0001-9852-1987", + "name": "B\u00e9r\u00e9nice Batut" + } + ], + "format-version": "0.1", + "license": "MIT", + "release": "0.1", + "name": "Host or Contamination removal on long-reads", + "readme": "# Host or Contamination removal on long-reads\n\nThe extraction of microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also other contaminant). 
It is important to get rid of all host/contamination sequences and to only retain microbiome sequences, both in order to speed up further steps and to avoid host/contamination sequences compromising the analysis.\n\nThis workflow takes Nanopore fastq(.gz) files and executes the following steps:\n1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Minimap 2**,\n2. Filtering of the generated BAM using **BAMtools** and **Samtools** to keep only the reads that do not align,\n3. Generation of mapping statistics using **QualiMap**,\n4. Aggregation of the mapping statistics using **MultiQC**\n\n## Input Datasets\n\n- A list of datasets corresponding to reads in `fastqsanger` or `fastqsanger.gz` format.\n- Reference genome\n- Profile for mapping\n\n## Output Datasets\n\n- A list of datasets corresponding to unmapped reads in `fastqsanger` or `fastqsanger.gz`.\n- A list of reports of QualiMap for each sample that could be used as inputs for extra MultiQC\n- MultiQC report of the mapping statistics in HTML", + "report": { + "markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n" + }, + "steps": { + "0": { + "annotation": "Reads not mapping to this reference genome will be kept.", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "Reads not mapping to this reference genome will be kept.", + "name": "Host/Contaminant Reference Genome (long-reads)" + } + ], + "label": "Host/Contaminant Reference Genome (long-reads)", + "name": "Input parameter", + "outputs": [], + "position": { + "left": 0, + "top": 0 + }, + "tool_id": null, + "tool_state": "{\"multiple\": false, \"validators\": [], \"restrictOnConnections\": true, \"parameter_type\": \"text\", \"optional\": false}", + "tool_version": null, + "type": 
"parameter_input", + "uuid": "b88baf2c-7b71-414c-9840-3365a6bb8a27", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "Long-reads as a collection of fastqsanger(.gz) files", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "Long-reads as a collection of fastqsanger(.gz) files", + "name": "Long-reads" + } + ], + "label": "Long-reads", + "name": "Input dataset collection", + "outputs": [], + "position": { + "left": 0, + "top": 160 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null, \"collection_type\": \"list\", \"fields\": null}", + "tool_version": null, + "type": "data_collection_input", + "uuid": "380f0855-f80f-4dc0-a9df-9e93db63186b", + "when": null, + "workflow_outputs": [] + }, + "2": { + "annotation": "Each profile comes with the preconfigured settings mentioned in parentheses. You can customize each profile further in the indexing, mapping and alignment options sections below. If you do not select a profile here, the tool will use the per-parameter defaults listed in the below sections unless you customize them.", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "Each profile comes with the preconfigured settings mentioned in parentheses. You can customize each profile further in the indexing, mapping and alignment options sections below. 
If you do not select a profile here, the tool will use the per-parameter defaults listed in the below sections unless you customize them.", + "name": "Profile of preset options for the mapping (long-read)" + } + ], + "label": "Profile of preset options for the mapping (long-read)", + "name": "Input parameter", + "outputs": [], + "position": { + "left": 0, + "top": 300 + }, + "tool_id": null, + "tool_state": "{\"multiple\": false, \"validators\": [], \"restrictOnConnections\": true, \"parameter_type\": \"text\", \"optional\": false}", + "tool_version": null, + "type": "parameter_input", + "uuid": "b765c0c9-10b8-4395-8bd6-7b3141489c00", + "when": null, + "workflow_outputs": [] + }, + "3": { + "annotation": "Map the reads against a reference genome and output the ones not mapping the reference genome", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/minimap2/minimap2/2.28+galaxy2", + "errors": null, + "id": 3, + "input_connections": { + "fastq_input|analysis_type_selector": { + "id": 2, + "output_name": "output" + }, + "fastq_input|fastq_input1": { + "id": 1, + "output_name": "output" + }, + "reference_source|ref_file": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Map with minimap2", + "name": "fastq_input" + }, + { + "description": "runtime parameter for tool Map with minimap2", + "name": "fastq_input" + }, + { + "description": "runtime parameter for tool Map with minimap2", + "name": "reference_source" + } + ], + "label": "minimap2", + "name": "Map with minimap2", + "outputs": [ + { + "name": "alignment_output", + "type": "bam" + } + ], + "position": { + "left": 300, + "top": 140 + }, + "post_job_actions": { + "HideDatasetActionalignment_output": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "alignment_output" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/minimap2/minimap2/2.28+galaxy2", + "tool_shed_repository": { + "changeset_revision": 
"6945cd53bd2d", + "name": "minimap2", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"alignment_options\": {\"splicing\": {\"splice_mode\": \"preset\", \"__current_case__\": 0}, \"A\": null, \"B\": null, \"O\": null, \"O2\": null, \"E\": null, \"E2\": null, \"z\": null, \"z2\": null, \"s\": null, \"no_end_flt\": true}, \"fastq_input\": {\"fastq_input_selector\": \"single\", \"__current_case__\": 0, \"fastq_input1\": {\"__class__\": \"ConnectedValue\"}, \"analysis_type_selector\": {\"__class__\": \"ConnectedValue\"}}, \"indexing_options\": {\"H\": false, \"k\": null, \"w\": null, \"I\": null}, \"io_options\": {\"output_format\": \"BAM\", \"Q\": false, \"L\": false, \"K\": null, \"cs\": null, \"c\": false, \"eqx\": false, \"Y\": false}, \"mapping_options\": {\"N\": null, \"F\": null, \"f\": null, \"kmer_ocurrence_interval\": {\"interval\": \"\", \"__current_case__\": 1}, \"min_occ_floor\": null, \"q_occ_frac\": \"0.01\", \"g\": null, \"r\": null, \"n\": null, \"m\": null, \"max_chain_skip\": null, \"max_chain_iter\": null, \"X\": false, \"p\": null, \"mask_len\": null}, \"reference_source\": {\"reference_source_selector\": \"cached\", \"__current_case__\": 0, \"ref_file\": {\"__class__\": \"ConnectedValue\"}}, \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.28+galaxy2", + "type": "tool", + "uuid": "b43a7f78-e9d5-49ab-98c9-070514036269", + "when": null, + "workflow_outputs": [] + }, + "4": { + "annotation": "Generation of mapping statistics", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/qualimap_bamqc/qualimap_bamqc/2.3+galaxy0", + "errors": null, + "id": 4, + "input_connections": { + "input1": { + "id": 3, + "output_name": "alignment_output" + } + }, + "inputs": [], + "label": "QualiMap", + "name": "QualiMap BamQC", + "outputs": [ + { + "name": "raw_data", + "type": "input" + }, + { + "name": "output_html", + "type": "html" + } + ], + "position": { + "left": 561.8009561567164, + "top": 
126.87561946128731 + }, + "post_job_actions": { + "HideDatasetActionoutput_html": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output_html" + }, + "HideDatasetActionraw_data": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "raw_data" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/qualimap_bamqc/qualimap_bamqc/2.3+galaxy0", + "tool_shed_repository": { + "changeset_revision": "30a201c9c310", + "name": "qualimap_bamqc", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"duplicate_skipping\": \"0\", \"input1\": {\"__class__\": \"ConnectedValue\"}, \"per_base_coverage\": false, \"plot_specific\": {\"n_bins\": \"400\", \"paint_chromosome_limits\": true, \"genome_gc_distr\": null, \"homopolymer_size\": \"3\"}, \"stats_regions\": {\"region_select\": \"all\", \"__current_case__\": 0}, \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.3+galaxy0", + "type": "tool", + "uuid": "ff81b1f0-47ef-4a70-b23b-a009aeb0711b", + "when": null, + "workflow_outputs": [ + { + "label": "qualimap_stats", + "output_name": "raw_data", + "uuid": "7df71ec2-03a9-4385-9a24-d57b2ad3360d" + } + ] + }, + "5": { + "annotation": "Split BAM into mapped and unmapped", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/bamtools_split_mapped/bamtools_split_mapped/2.5.2+galaxy2", + "errors": null, + "id": 5, + "input_connections": { + "input_bam": { + "id": 3, + "output_name": "alignment_output" + } + }, + "inputs": [], + "label": "Split BAM", + "name": "Split BAM by reads mapping status", + "outputs": [ + { + "name": "mapped", + "type": "bam" + }, + { + "name": "unmapped", + "type": "bam" + } + ], + "position": { + "left": 600, + "top": 370 + }, + "post_job_actions": { + "HideDatasetActionmapped": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "mapped" + }, + "HideDatasetActionunmapped": { + "action_arguments": {}, + "action_type": 
"HideDatasetAction", + "output_name": "unmapped" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/bamtools_split_mapped/bamtools_split_mapped/2.5.2+galaxy2", + "tool_shed_repository": { + "changeset_revision": "fa7b5520ae53", + "name": "bamtools_split_mapped", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"input_bam\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.5.2+galaxy2", + "type": "tool", + "uuid": "56141fe0-3b82-4005-8035-9369f30511d5", + "when": null, + "workflow_outputs": [] + }, + "6": { + "annotation": "Prepare QualiMap stats for MultiQC", + "content_id": "__FLATTEN__", + "errors": null, + "id": 6, + "input_connections": { + "input": { + "id": 4, + "output_name": "raw_data" + } + }, + "inputs": [], + "label": "Flatten collection", + "name": "Flatten collection", + "outputs": [ + { + "name": "output", + "type": "input" + } + ], + "position": { + "left": 838.3582089552239, + "top": 78.05970149253731 + }, + "post_job_actions": {}, + "tool_id": "__FLATTEN__", + "tool_state": "{\"input\": {\"__class__\": \"ConnectedValue\"}, \"join_identifier\": \"_\", \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.0", + "type": "tool", + "uuid": "0b98eb14-0255-4f48-8497-4f7f9d97bc35", + "when": null, + "workflow_outputs": [ + { + "label": "QualiMap mapping statistics", + "output_name": "output", + "uuid": "1944f8cc-e22e-4dec-a8da-325bb3165e8b" + } + ] + }, + "7": { + "annotation": "Extractions of FastQ from the unmapped reads", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/samtools_fastx/samtools_fastx/1.21+galaxy0", + "errors": null, + "id": 7, + "input_connections": { + "input": { + "id": 5, + "output_name": "unmapped" + } + }, + "inputs": [], + "label": null, + "name": "Samtools fastx", + "outputs": [ + { + "name": "output", + "type": "fasta" + } + ], + "position": { + "left": 900, + "top": 390 + }, + "post_job_actions": { + 
"RenameDatasetActionoutput": { + "action_arguments": { + "newname": "Reads without host or contamination reads" + }, + "action_type": "RenameDatasetAction", + "output_name": "output" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/samtools_fastx/samtools_fastx/1.21+galaxy0", + "tool_shed_repository": { + "changeset_revision": "9038311ed624", + "name": "samtools_fastx", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"copy_arb_tags\": null, \"copy_tags\": false, \"exclusive_filter\": [\"256\", \"2048\"], \"exclusive_filter_all\": null, \"idxout_cond\": {\"idxout_select\": \"no\", \"__current_case__\": 0}, \"inclusive_filter\": null, \"input\": {\"__class__\": \"ConnectedValue\"}, \"output_fmt_cond\": {\"output_fmt_select\": \"fastqsanger\", \"__current_case__\": 0, \"default_quality\": null, \"output_quality\": false, \"ilumina_casava\": false}, \"outputs\": \"other\", \"read_numbering\": \"\", \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.21+galaxy0", + "type": "tool", + "uuid": "2e9642e7-4404-4d9b-9530-912c62a7a665", + "when": null, + "workflow_outputs": [ + { + "label": "samtools_fastx", + "output_name": "output", + "uuid": "1a82e66c-24c1-43f9-9c3c-121bdb895f74" + } + ] + }, + "8": { + "annotation": "Aggregation of the mapping statistics for all samples", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.27+galaxy3", + "errors": null, + "id": 8, + "input_connections": { + "results_0|software_cond|input": { + "id": 6, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool MultiQC", + "name": "image_content_input" + } + ], + "label": "MultiQC", + "name": "MultiQC", + "outputs": [ + { + "name": "html_report", + "type": "html" + }, + { + "name": "stats", + "type": "tabular" + } + ], + "position": { + "left": 1078.5173740671642, + "top": 129.26367916277985 + }, + "post_job_actions": { + "HideDatasetActionstats": { + "action_arguments": {}, 
+ "action_type": "HideDatasetAction", + "output_name": "stats" + }, + "RenameDatasetActionhtml_report": { + "action_arguments": { + "newname": "MultiQC HTML report" + }, + "action_type": "RenameDatasetAction", + "output_name": "html_report" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.27+galaxy3", + "tool_shed_repository": { + "changeset_revision": "31c42a2c02d3", + "name": "multiqc", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"comment\": \"\", \"export\": false, \"flat\": false, \"image_content_input\": {\"__class__\": \"RuntimeValue\"}, \"results\": [{\"__index__\": 0, \"software_cond\": {\"software\": \"qualimap\", \"__current_case__\": 20, \"input\": {\"__class__\": \"ConnectedValue\"}}}], \"title\": \"Host/Contamination Removal\", \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.27+galaxy3", + "type": "tool", + "uuid": "9bfa844d-9fee-4589-8568-80070c6c6479", + "when": null, + "workflow_outputs": [ + { + "label": "multiqc_html_report", + "output_name": "html_report", + "uuid": "62e6de76-2033-413e-9f5c-7f97a3e1741d" + } + ] + } + }, + "tags": [ + "microbiome", + "contamination", + "long_reads" + ], + "uuid": "3af8ce70-dc73-42c1-9d20-1107f84c5395", + "version": 9 +} \ No newline at end of file diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json new file mode 100644 index 0000000000..edb9b7bb9b --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json @@ -0,0 +1 @@ +{"Long-reads": {"class": "Collection", "collection_type": "list", "elements": [{"class": "File", "identifier": "Spike3bBarcode10", "location": "https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode10.fastq.gz", 
"filetype": "fastqsanger.gz"}, {"class": "File", "identifier": "Spike3bBarcode12", "location": "https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode12.fastq.gz", "filetype": "fastqsanger.gz"}]}, "Host/Contaminant Reference Genome (long-reads)": "hg38", "Profile of preset options for the mapping (long-read)": "map-pb"} \ No newline at end of file From 8d2a487898704d65208cf7fa89b7033553af3c8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Thu, 9 Oct 2025 17:01:34 +0200 Subject: [PATCH 2/9] Add short reads workflow for host or contamination removal --- .../.dockstore.yml | 13 + .../CHANGELOG.md | 5 + .../README.md | 18 ++ ...amination-removal-on-short-reads-tests.yml | 34 ++ ...or-contamination-removal-on-short-reads.ga | 299 ++++++++++++++++++ 5 files changed, 369 insertions(+) create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/.dockstore.yml create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml create mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/.dockstore.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/.dockstore.yml new file mode 100644 index 0000000000..f91dcc645a --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/.dockstore.yml @@ -0,0 +1,13 @@ +version: 1.2 +workflows: +- name: main + subclass: Galaxy + publish: true 
+ primaryDescriptorPath: /host-or-contamination-removal-on-short-reads.ga + testParameterFiles: + - /host-or-contamination-removal-on-short-reads-tests.yml + authors: + - name: Paul Zierep + orcid: 0000-0003-2982-388X + - name: "B\xE9r\xE9nice Batut" + orcid: 0000-0001-9852-1987 diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md new file mode 100644 index 0000000000..2bef198049 --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md @@ -0,0 +1,5 @@ +# Changelog + +## [0.1] yyyy-mm-dd + +First release. diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md new file mode 100644 index 0000000000..929664e4a2 --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md @@ -0,0 +1,18 @@ +# Host or Contamination removal on short-reads + +The extraction of microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also other contaminant). It is important to get rid of all host/contamination sequences and to only retain microbiome sequences, both in order to speed up further steps and to avoid host/contamination sequences compromising the analysis. + +This workflow takes paired-end Illumina fastq(.gz) files and executes the following steps: +1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Bowtie 2** +2. Aggregation of the mapping reports using **MultiQC** + +## Input Datasets + +- A list of paired datasets corresponding to paired-end reads in `fastqsanger` or `fastqsanger.gz` format. 
+- Reference genome + +## Output Datasets + +- A list of paired datasets corresponding to paired-end reads without the reads mapping to the reference genomes, in `fastqsanger` or `fastqsanger.gz`. +- List of `JSON` reports of Bowtie2 for each sample that could be used as inputs for extra MultiQC +- MultiQC report of the mapping statistics in HTML \ No newline at end of file diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml new file mode 100644 index 0000000000..7e7e5034d0 --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml @@ -0,0 +1,34 @@ +- doc: Test outline for Host-or-Contamination-removal-on-short-reads + job: + Short-reads: + class: Collection + collection_type: list:paired + elements: + - class: Collection + type: paired + identifier: pair + elements: + - class: File + identifier: forward + location: https://zenodo.org/records/15089018/files/MAG_reads_forward.fastqsanger.gz + filetype: fastqsanger.gz + - class: File + identifier: reverse + location: https://zenodo.org/records/15089018/files/MAG_reads_reverse.fastqsanger.gz + filetype: fastqsanger.gz + Host/Contaminant Reference Genome: hg38full + outputs: + multiqc_html_report: + asserts: + - has_text: + text: "pair" + - has_text: + text: "Bowtie" + bowtie2_mapping_statistics: + element_tests: + pair: + asserts: + has_text: + text: "9462 reads" + has_n_lines: + value: 15 \ No newline at end of file diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga 
b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga new file mode 100644 index 0000000000..46da67999f --- /dev/null +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga @@ -0,0 +1,299 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "This workflow takes paired-end Illumina fastq(.gz) files and runs Bowtie to map the reads against a reference genome (human, by default) and keep only the reads that do not align. MultiQC is used to aggregate the mapping reports.", + "comments": [], + "creator": [ + { + "class": "Person", + "identifier": "https://orcid.org/0000-0003-2982-388X", + "name": "Paul Zierep" + }, + { + "class": "Person", + "identifier": "https://orcid.org/0000-0001-9852-1987", + "name": "Bérénice Batut" + } + ], + "format-version": "0.1", + "license": "MIT", + "release": "0.1", + "name": "Host or Contamination removal on short-reads", + "readme": "# Host or Contamination removal on short-reads\n\nThe extraction of microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also other contaminant). It is important to get rid of all host/contamination sequences and to only retain microbiome sequences, both in order to speed up further steps and to avoid host/contamination sequences compromising the analysis.\n\nThis workflow takes paired-end Illumina fastq(.gz) files and executes the following steps:\n1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Bowtie 2**\n2. 
Aggregation of the mapping reports using **MultiQC**\n\n## Input Datasets\n\n- A list of paired datasets corresponding to paired-end reads in `fastqsanger` or `fastqsanger.gz` format.\n- Reference genome\n\n## Output Datasets\n\n- A list of paired datasets corresponding to paired-end reads without the reads mapping to the reference genomes, in `fastqsanger` or `fastqsanger.gz`.\n- List of `JSON` reports of Bowtie2 for each sample that could be used as inputs for extra MultiQC\n- MultiQC report of the mapping statistics in HTML", + "report": { + "markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n" + }, + "steps": { + "0": { + "annotation": "Short-reads as a paired-end collection of fastqsanger(.gz) files", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "Short-reads as a paired-end collection of fastqsanger(.gz) files", + "name": "Short-reads" + } + ], + "label": "Short-reads", + "name": "Input dataset collection", + "outputs": [], + "position": { + "left": 0, + "top": 0 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"tag\": null, \"collection_type\": \"list:paired\", \"fields\": null}", + "tool_version": null, + "type": "data_collection_input", + "uuid": "f0ca536e-1255-4b44-8fad-73adec33ff74", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "Reads not mapping to this reference genome will be kept.", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "Reads not mapping to this reference genome will be kept.", + "name": "Host/Contaminant Reference Genome" + } + ], + "label": "Host/Contaminant Reference Genome", + "name": "Input parameter", + "outputs": [], + "position": { + "left": 0, + "top": 140 + }, + "tool_id": null, + "tool_state": 
"{\"multiple\": false, \"validators\": [], \"restrictOnConnections\": true, \"parameter_type\": \"text\", \"optional\": false}", + "tool_version": null, + "type": "parameter_input", + "uuid": "af636432-a9d2-4b0f-ba42-5eaa1cbe750a", + "when": null, + "workflow_outputs": [] + }, + "2": { + "annotation": "Map the reads against a reference genome and output the ones not mapping the reference genome", + "content_id": "toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.5.3+galaxy1", + "errors": null, + "id": 2, + "input_connections": { + "library|input_1": { + "id": 0, + "output_name": "output" + }, + "reference_genome|index": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Bowtie2", + "name": "library" + }, + { + "description": "runtime parameter for tool Bowtie2", + "name": "reference_genome" + } + ], + "label": "Bowtie2", + "name": "Bowtie2", + "outputs": [ + { + "name": "output_unaligned_reads_l", + "type": "fastqsanger" + }, + { + "name": "output_unaligned_reads_r", + "type": "fastqsanger" + }, + { + "name": "output", + "type": "bam" + }, + { + "name": "mapping_stats", + "type": "txt" + } + ], + "position": { + "left": 310, + "top": 0 + }, + "post_job_actions": { + "HideDatasetActionoutput": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output" + }, + "HideDatasetActionoutput_unaligned_reads_l": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output_unaligned_reads_l" + }, + "HideDatasetActionoutput_unaligned_reads_r": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "output_unaligned_reads_r" + }, + "RenameDatasetActionmapping_stats": { + "action_arguments": { + "newname": "Bowtie2 mapping statistics" + }, + "action_type": "RenameDatasetAction", + "output_name": "mapping_stats" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/2.5.3+galaxy1", + "tool_shed_repository": 
{ + "changeset_revision": "d5ceb9f3c25b", + "name": "bowtie2", + "owner": "devteam", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"analysis_type\": {\"analysis_type_selector\": \"simple\", \"__current_case__\": 0, \"presets\": \"no_presets\"}, \"library\": {\"type\": \"paired_collection\", \"__current_case__\": 2, \"input_1\": {\"__class__\": \"ConnectedValue\"}, \"unaligned_file\": true, \"aligned_file\": false, \"paired_options\": {\"paired_options_selector\": \"no\", \"__current_case__\": 1}}, \"reference_genome\": {\"source\": \"indexed\", \"__current_case__\": 0, \"index\": {\"__class__\": \"ConnectedValue\"}}, \"rg\": {\"rg_selector\": \"do_not_set\", \"__current_case__\": 3}, \"sam_options\": {\"sam_options_selector\": \"no\", \"__current_case__\": 1}, \"save_mapping_stats\": true, \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.5.3+galaxy1", + "type": "tool", + "uuid": "4f0ed86e-4ab8-4737-a980-41405a6c5295", + "when": null, + "workflow_outputs": [ + { + "label": "bowtie2_mapping_statistics", + "output_name": "mapping_stats", + "uuid": "76c58935-0b96-4549-8ff4-bd17952c903f" + } + ] + }, + "3": { + "annotation": "Take two collections and create a paired collection from them.", + "content_id": "__ZIP_COLLECTION__", + "errors": null, + "id": 3, + "input_connections": { + "input_forward": { + "id": 2, + "output_name": "output_unaligned_reads_l" + }, + "input_reverse": { + "id": 2, + "output_name": "output_unaligned_reads_r" + } + }, + "inputs": [], + "label": "Create a paired collection", + "name": "Zip collections", + "outputs": [ + { + "name": "output", + "type": "input" + } + ], + "position": { + "left": 620, + "top": 0 + }, + "post_job_actions": { + "RenameDatasetActionoutput": { + "action_arguments": { + "newname": "Reads without host or contaminant reads" + }, + "action_type": "RenameDatasetAction", + "output_name": "output" + } + }, + "tool_id": "__ZIP_COLLECTION__", + "tool_state": "{\"input_forward\": 
{\"__class__\": \"ConnectedValue\"}, \"input_reverse\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.0", + "type": "tool", + "uuid": "63e069fd-7c6d-48e1-b891-9ad0a93a5516", + "when": null, + "workflow_outputs": [ + { + "label": "contamination_filtered_reads", + "output_name": "output", + "uuid": "02423350-1d61-4be2-a743-9dca7bae63b8" + } + ] + }, + "4": { + "annotation": "Aggregation of the mapping statistics for all samples", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.27+galaxy3", + "errors": null, + "id": 4, + "input_connections": { + "results_0|software_cond|input": { + "id": 2, + "output_name": "mapping_stats" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool MultiQC", + "name": "image_content_input" + } + ], + "label": "MultiQC", + "name": "MultiQC", + "outputs": [ + { + "name": "html_report", + "type": "html" + }, + { + "name": "stats", + "type": "tabular" + } + ], + "position": { + "left": 620, + "top": 240 + }, + "post_job_actions": { + "HideDatasetActionstats": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "stats" + }, + "RenameDatasetActionhtml_report": { + "action_arguments": { + "newname": "MultiQC HTML report" + }, + "action_type": "RenameDatasetAction", + "output_name": "html_report" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.27+galaxy3", + "tool_shed_repository": { + "changeset_revision": "31c42a2c02d3", + "name": "multiqc", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"comment\": \"\", \"export\": false, \"flat\": false, \"image_content_input\": {\"__class__\": \"RuntimeValue\"}, \"results\": [{\"__index__\": 0, \"software_cond\": {\"software\": \"bowtie2\", \"__current_case__\": 3, \"input\": {\"__class__\": \"ConnectedValue\"}}}], \"title\": \"Host Removal\", \"__page__\": 0, \"__rerun_remap_job_id__\": null}", + "tool_version": 
"1.27+galaxy3", + "type": "tool", + "uuid": "c982928f-6392-4608-a8c7-88046300a4b1", + "when": null, + "workflow_outputs": [ + { + "label": "multiqc_html_report", + "output_name": "html_report", + "uuid": "62e6de76-2033-413e-9f5c-7f97a3e1741d" + } + ] + } + }, + "tags": [ + "microbiome", + "contamination", + "short_reads" + ], + "uuid": "0fc16104-af45-48b0-bec5-6540e3dc2114", + "version": 10 +} \ No newline at end of file From d118fe5e94b38ab52c17e7a39ebca850c008438d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Thu, 13 Nov 2025 11:33:54 +0100 Subject: [PATCH 3/9] Update workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml Co-authored-by: paulzierep --- .../host-or-contamination-removal-on-short-reads-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml index 7e7e5034d0..1c0f22f4b7 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml @@ -31,4 +31,4 @@ has_text: text: "9462 reads" has_n_lines: - value: 15 \ No newline at end of file + n: 15 \ No newline at end of file From c03b9dc50acd2270841f4360a72cc2874377415d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Wed, 3 Dec 2025 14:33:47 +0100 Subject: [PATCH 4/9] Update workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml Co-authored-by: 
paulzierep --- .../host-or-contamination-removal-on-short-reads-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml index 1c0f22f4b7..93c7ead82b 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml @@ -16,7 +16,7 @@ identifier: reverse location: https://zenodo.org/records/15089018/files/MAG_reads_reverse.fastqsanger.gz filetype: fastqsanger.gz - Host/Contaminant Reference Genome: hg38full + Host/Contaminant Reference Genome: hg38 outputs: multiqc_html_report: asserts: From d99c8899228942a8ff06ffb1583cd13d0440e984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Wed, 3 Dec 2025 15:14:04 +0100 Subject: [PATCH 5/9] Delete workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json --- .../plnmotmptestjob4912th98.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json deleted file mode 100644 index edb9b7bb9b..0000000000 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/plnmotmptestjob4912th98.json +++ /dev/null @@ -1 +0,0 @@ -{"Long-reads": 
{"class": "Collection", "collection_type": "list", "elements": [{"class": "File", "identifier": "Spike3bBarcode10", "location": "https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode10.fastq.gz", "filetype": "fastqsanger.gz"}, {"class": "File", "identifier": "Spike3bBarcode12", "location": "https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode12.fastq.gz", "filetype": "fastqsanger.gz"}]}, "Host/Contaminant Reference Genome (long-reads)": "hg38", "Profile of preset options for the mapping (long-read)": "map-pb"} \ No newline at end of file From 1a313fbebac02202341d6292d4469b97360e91c7 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Wed, 3 Dec 2025 16:42:26 +0100 Subject: [PATCH 6/9] Fix workflow output labels and improve documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR review feedback for #991: - Update workflow output labels to use human-readable names without underscores: * Long-reads: "qualimap_stats" → "QualiMap Statistics" * Long-reads: "samtools_fastx" → "Reads without Host or Contamination" * Long-reads: "multiqc_html_report" → "MultiQC HTML Report" * Short-reads: "bowtie2_mapping_statistics" → "Bowtie2 Mapping Statistics" * Short-reads: "contamination_filtered_reads" → "Contamination Filtered Reads" * Short-reads: "multiqc_html_report" → "MultiQC HTML Report" - Update corresponding labels in test files to match workflow outputs - Fix workflow name capitalization: * "removal" → "Removal" in both workflow names - Update CHANGELOG dates from "yyyy-mm-dd" to actual date (2025-12-03) - Improve README documentation: * Fix step numbering in long-reads README (was: 1,2,3,2; now: 1,2,3,4) * Add "When to use this workflow" sections to both READMEs * Cross-reference between long-reads and short-reads workflows 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../CHANGELOG.md | 6 +++--- .../README.md | 14 
+++++++++----- ...r-contamination-removal-on-long-reads-tests.yml | 6 +++--- .../host-or-contamination-removal-on-long-reads.ga | 8 ++++---- .../CHANGELOG.md | 6 +++--- .../README.md | 6 +++++- ...-contamination-removal-on-short-reads-tests.yml | 4 ++-- ...host-or-contamination-removal-on-short-reads.ga | 8 ++++---- 8 files changed, 33 insertions(+), 25 deletions(-) diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md index 2bef198049..dcf147a601 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/CHANGELOG.md @@ -1,5 +1,5 @@ # Changelog - -## [0.1] yyyy-mm-dd - + +## [0.1] 2025-12-03 + First release. diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md index 45aeb3ec82..bc63be96b2 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/README.md @@ -3,10 +3,10 @@ The extraction of microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also other contaminant). It is an important to get rid of all host/contamination sequences and to only retain microbiome sequences, both in order to speed up further steps and to avoid host/contamination sequences compromising the analysis. This workflow takes Nanopore fastq(.gz) files and executes the following steps: -1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Minimap 2**, -2. 
Filtering of the generated BAM using **BAMtools** and **Samtools** to keep only the reads that do not align, -3. Generation of mapping statistics using **QualiMap**, -2. Aggregation of the mapping statistics using **MultiQC** +1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Minimap 2** +2. Filtering of the generated BAM using **BAMtools** and **Samtools** to keep only the reads that do not align +3. Generation of mapping statistics using **QualiMap** +4. Aggregation of the mapping statistics using **MultiQC** ## Input Datasets @@ -18,4 +18,8 @@ This workflow takes Nanopore fastq(.gz) files and executes the following steps: - A list of datasets corresponding to unmapped reads in `fastqsanger` or `fastqsanger.gz`. - A list of reports of QualiMap for each sample that could be used as inputs for extra MultiQC -- MultiQC report of the mapping statistics in HTML \ No newline at end of file +- MultiQC report of the mapping statistics in HTML + +## When to use this workflow + +Use this workflow for **long-read sequencing data** (e.g., Nanopore, PacBio). For short-read Illumina data, see the [Host or Contamination removal on short-reads](../host-contamination-removal-short-reads/) workflow. 
\ No newline at end of file diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml index 05c71ec7d6..9a3f5b49e8 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml @@ -15,7 +15,7 @@ Host/Contaminant Reference Genome (long-reads): hg38 Profile of preset options for the mapping (long-read): map-pb outputs: - qualimap_stats: + QualiMap Statistics: element_tests: Spike3bBarcode10: elements: @@ -165,13 +165,13 @@ text: "#Mapping quality" has_n_lines: value: 37 - multiqc_html_report: + MultiQC HTML Report: asserts: has_text: text: "Spike3bBarcode10" has_text: text: "Spike3bBarcode12" - samtools_fastx: + Reads without Host or Contamination: element_tests: Spike3bBarcode10: asserts: diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga index 3a960d9ffb..edc730ac99 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads.ga @@ -17,7 +17,7 @@ "format-version": "0.1", "license": "MIT", "release": "0.1", - "name": "Host or Contamination removal on long-reads", + "name": "Host or Contamination Removal on Long-Reads", "readme": "# Host or Contamination 
removal on long-reads\n\nThe extraction of microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also other contaminant). It is an important to get rid of all host/contamination sequences and to only retain microbiome sequences, both in order to speed up further steps and to avoid host/contamination sequences compromising the analysis.\n\nThis workflow takes Nanopore fastq(.gz) files and executes the following steps:\n1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Minimap 2**,\n2. Filtering of the generated BAM using **BAMtools** and **Samtools** to keep only the reads that do not align,\n3. Generation of mapping statistics using **QualiMap**,\n2. Aggregation of the mapping statistics using **MultiQC**\n\n## Input Datasets\n\n- A list of datasets corresponding to reads in `fastqsanger` or `fastqsanger.gz` format.\n- Reference genome\n- Profile for mapping\n\n## Output Datasets\n\n- A list of datasets corresponding to unmapped reads in `fastqsanger` or `fastqsanger.gz`.\n- A list of reports of QualiMap for each sample that could be used as inputs for extra MultiQC\n- MultiQC report of the mapping statistics in HTML", "report": { "markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n" @@ -224,7 +224,7 @@ "when": null, "workflow_outputs": [ { - "label": "qualimap_stats", + "label": "QualiMap Statistics", "output_name": "raw_data", "uuid": "7df71ec2-03a9-4385-9a24-d57b2ad3360d" } @@ -370,7 +370,7 @@ "when": null, "workflow_outputs": [ { - "label": "samtools_fastx", + "label": "Reads without Host or Contamination", "output_name": "output", "uuid": "1a82e66c-24c1-43f9-9c3c-121bdb895f74" } @@ -437,7 +437,7 @@ "when": null, "workflow_outputs": [ { - "label": "multiqc_html_report", + "label": "MultiQC HTML Report", "output_name": 
"html_report", "uuid": "62e6de76-2033-413e-9f5c-7f97a3e1741d" } diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md index 2bef198049..dcf147a601 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/CHANGELOG.md @@ -1,5 +1,5 @@ # Changelog - -## [0.1] yyyy-mm-dd - + +## [0.1] 2025-12-03 + First release. diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md index 929664e4a2..a060198b91 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/README.md @@ -15,4 +15,8 @@ This workflow takes paired-end Illumina fastq(.gz) files and executes the follow - A list of paired datasets corresponding to paired-end reads without the reads mapping to the reference genomes, in `fastqsanger` or `fastqsanger.gz`. - List of `JSON` reports of Bowtie2 for each sample that could be used as inputs for extra MultiQC -- MultiQC report of the mapping statistics in HTML \ No newline at end of file +- MultiQC report of the mapping statistics in HTML + +## When to use this workflow + +Use this workflow for **short-read paired-end Illumina sequencing data**. For long-read data (Nanopore, PacBio), see the [Host or Contamination removal on long-reads](../host-contamination-removal-long-reads/) workflow. 
\ No newline at end of file diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml index 93c7ead82b..fe5d5e078a 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads-tests.yml @@ -18,13 +18,13 @@ filetype: fastqsanger.gz Host/Contaminant Reference Genome: hg38 outputs: - multiqc_html_report: + MultiQC HTML Report: asserts: has_text: text: "pair" has_text: text: "Bowtie" - bowtie2_mapping_statistics: + Bowtie2 Mapping Statistics: element_tests: pair: asserts: diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga index 46da67999f..dcc47487bb 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-short-reads/host-or-contamination-removal-on-short-reads.ga @@ -17,7 +17,7 @@ "format-version": "0.1", "license": "MIT", "release": "0.1", - "name": "Host or Contamination removal on short-reads", + "name": "Host or Contamination Removal on Short-Reads", "readme": "# Host or Contamination removal on short-reads\n\nThe extraction of microbiome DNA or RNA is usually contaminated by host and human DNA or RNA (but also other contaminant). 
It is an important to get rid of all host/contamination sequences and to only retain microbiome sequences, both in order to speed up further steps and to avoid host/contamination sequences compromising the analysis.\n\nThis workflow takes paired-end Illumina fastq(.gz) files and executes the following steps:\n1. Mapping of the reads against a reference genome of the host or contaminant (e.g. human) using **Bowtie 2**\n2. Aggregation of the mapping reports using **MultiQC**\n\n## Input Datasets\n\n- A list of paired datasets corresponding to paired-end reads in `fastqsanger` or `fastqsanger.gz` format.\n- Reference genome\n\n## Output Datasets\n\n- A list of paired datasets corresponding to paired-end reads without the reads mapping to the reference genomes, in `fastqsanger` or `fastqsanger.gz`.\n- List of `JSON` reports of Bowtie2 for each sample that could be used as inputs for extra MultiQC\n- MultiQC report of the mapping statistics in HTML", "report": { "markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n" @@ -164,7 +164,7 @@ "when": null, "workflow_outputs": [ { - "label": "bowtie2_mapping_statistics", + "label": "Bowtie2 Mapping Statistics", "output_name": "mapping_stats", "uuid": "76c58935-0b96-4549-8ff4-bd17952c903f" } @@ -215,7 +215,7 @@ "when": null, "workflow_outputs": [ { - "label": "contamination_filtered_reads", + "label": "Contamination Filtered Reads", "output_name": "output", "uuid": "02423350-1d61-4be2-a743-9dca7bae63b8" } @@ -282,7 +282,7 @@ "when": null, "workflow_outputs": [ { - "label": "multiqc_html_report", + "label": "MultiQC HTML Report", "output_name": "html_report", "uuid": "62e6de76-2033-413e-9f5c-7f97a3e1741d" } From 1659aacd74b77642c46f2f3aab4f12cd9c2b9371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Fri, 5 Dec 2025 10:46:15 +0100 Subject: 
[PATCH 7/9] Try to fix tests --- .../host-or-contamination-removal-on-long-reads-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml index 9a3f5b49e8..623f9b3c13 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml @@ -82,7 +82,7 @@ mapping_quality_across_reference: asserts: has_text: - text: "Filtered Reads" + text: "#Position (bp)" has_n_lines: value: 854 mapping_quality_histogram: @@ -156,7 +156,7 @@ mapping_quality_across_reference: asserts: has_text: - text: "Filtered Reads" + text: "#Position (bp)" has_n_lines: value: 854 mapping_quality_histogram: From dc7007288d5f85072f314720458b6c2f64ec2f9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Fri, 5 Dec 2025 16:18:52 +0100 Subject: [PATCH 8/9] Use smaller index --- .../host-or-contamination-removal-on-long-reads-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml index 623f9b3c13..aa063894ea 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml +++ 
b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml @@ -12,7 +12,7 @@ identifier: Spike3bBarcode12 location: https://zenodo.org/record/12190648/files/collection_of_all_samples_Spike3bBarcode12.fastq.gz filetype: fastqsanger.gz - Host/Contaminant Reference Genome (long-reads): hg38 + Host/Contaminant Reference Genome (long-reads): apiMel3 Profile of preset options for the mapping (long-read): map-pb outputs: QualiMap Statistics: From 7420550fd628e27542656f01ace53e205f6dc944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= Date: Fri, 5 Dec 2025 17:08:46 +0100 Subject: [PATCH 9/9] Fix tests --- ...tamination-removal-on-long-reads-tests.yml | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml index aa063894ea..811693d354 100644 --- a/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml +++ b/workflows/microbiome/host-contamination-removal/host-contamination-removal-long-reads/host-or-contamination-removal-on-long-reads-tests.yml @@ -24,31 +24,31 @@ has_text: text: "Spike3bBarcode10" has_text: - text: "3,209,286,105 bp" + text: "586,300,787 bp" coverage_across_reference: asserts: has_text: text: "#Position (bp)" has_n_lines: - value: 854 + value: 416 coverage_histogram: asserts: has_text: text: "Number of genomic locations" has_n_lines: - value: 7 + value: 10 genome_fraction_coverage: asserts: has_text: text: "#Coverage (X)" has_n_lines: - value: 151 + value: 51 duplication_rate_histogram: asserts: has_text: text: "#Duplication rate" has_text: - text: "104.0" + 
text: "17.0" homopolymer_indels: asserts: has_text: @@ -68,7 +68,7 @@ has_text: text: "#Read position (bp)" has_text: - text: "6.161988" + text: "38.123" mapped_reads_gc-content_distribution: asserts: has_text: @@ -78,19 +78,19 @@ mapped_reads_nucleotide_content: asserts: has_text: - text: "16.666666" + text: "6.25" mapping_quality_across_reference: asserts: has_text: text: "#Position (bp)" has_n_lines: - value: 854 + value: 416 mapping_quality_histogram: asserts: has_text: text: "#Mapping quality" has_n_lines: - value: 41 + value: 13 Spike3bBarcode12: elements: genome_results: @@ -98,19 +98,19 @@ has_text: text: "Spike3bBarcode12" has_text: - text: "3,209,286,105 bp" + text: "586,300,787 bp" coverage_across_reference: asserts: has_text: text: "#Position (bp)" has_n_lines: - value: 100 + value: 416 coverage_histogram: asserts: has_text: text: "Number of genomic locations" has_n_lines: - value: 4 + value: 6 genome_fraction_coverage: asserts: has_text: @@ -122,7 +122,7 @@ has_text: text: "#Duplication rate" has_text: - text: "119.0" + text: "8.0" homopolymer_indels: asserts: has_text: @@ -142,7 +142,7 @@ has_text: text: "#Read position (bp)" has_text: - text: "2.273913" + text: "0.03930972" mapped_reads_gc-content_distribution: asserts: has_text: @@ -152,19 +152,19 @@ mapped_reads_nucleotide_content: asserts: has_text: - text: "16.666666" + text: "16.0" mapping_quality_across_reference: asserts: has_text: text: "#Position (bp)" has_n_lines: - value: 854 + value: 416 mapping_quality_histogram: asserts: has_text: text: "#Mapping quality" has_n_lines: - value: 37 + value: 4 MultiQC HTML Report: asserts: has_text: