Commit 2682369

Merge pull request #219 from databio/dev
v0.14.1 release
2 parents: 3a8465a + 703580d

13 files changed (+104, -114 lines)


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -86,3 +86,6 @@ piper.egg-info/
 
 *ipynb_checkpoints*
 *.egg-info*
+
+
+example_pipelines/pipeline_output

README.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 
 [![Documentation Status](https://readthedocs.org/projects/pypiper/badge/?version=latest)](http://pypiper.readthedocs.org/en/latest/?badge=latest)
 [![Build Status](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml/badge.svg)](https://github.com/databio/pypiper/actions/workflows/run-pytest.yml)
-[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
+[![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io)
 [![pypi-badge](https://img.shields.io/pypi/v/piper)](https://pypi.org/project/piper)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

docs/changelog.md

Lines changed: 8 additions & 0 deletions
@@ -1,5 +1,13 @@
 # Changelog
 
+## [0.14.1] -- 2024-04-19
+### Changed
+- remove pipestat_project_name from PipelineManager parameters
+- refactor pipestat_sample_name to pipestat_record_identifier in PipelineManager parameters
+- update requirements for pipestat 0.9.0, ubiquerg 0.8.0, and yacman 0.9.3
+- set `force_overwrite` to default to true, Issue #209
+
+
 ## [0.14.0] -- 2023-12-22
 ### Changed
 - refactor for pipestat v0.6.0 release
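
The first two changelog entries above rename the manager's pipestat keyword argument. A minimal sketch of the corresponding call-site change, modeled on the bundled hello_pypiper example (the record name "sample1" is illustrative):

import pypiper

# 0.14.0 and earlier used pipestat_sample_name (and optionally
# pipestat_project_name, which is now removed outright):
#   pm = pypiper.PipelineManager(
#       name="hello_pypiper",
#       outfolder="hello_pypiper_results",
#       pipestat_sample_name="sample1",
#   )
# 0.14.1:
pm = pypiper.PipelineManager(
    name="hello_pypiper",
    outfolder="hello_pypiper_results",
    pipestat_record_identifier="sample1",
)
pm.run("echo 'Hello, Pypiper!' > hello_pypiper_results/output.txt",
       "hello_pypiper_results/output.txt")
pm.stop_pipeline()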

example_pipelines/basic.py

Lines changed: 2 additions & 2 deletions
@@ -26,15 +26,15 @@
 tgt = "pipeline_output/test.out"
 
 # build the command
-cmd = "shuf -i 1-500000000 -n 10000000 > " + tgt
+cmd = f"shuf -i 1-500000000 -n 10000000 > {tgt}"
 
 # and run with run().
 pm.run(cmd, target=tgt)
 
 # Now copy the data into a new file.
 # first specify target file and build command:
 tgt = "pipeline_output/copied.out"
-cmd = "cp pipeline_output/test.out " + tgt
+cmd = f"cp pipeline_output/test.out {tgt}"
 pm.run(cmd, target=tgt)
 
 # You can also string multiple commands together, which will execute

example_pipelines/hello_pypiper.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 
 # Now build a command-line command however you like, and pass it to pm.run()
 target_file = "hello_pypiper_results/output.txt"
-cmd = "echo 'Hello, Pypiper!' > " + target_file
+cmd = f"echo 'Hello, Pypiper!' > {target_file}"
 pm.run(cmd, target_file)
 
 pm.stop_pipeline()

pypiper/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.14.0"
+__version__ = "0.14.1"

pypiper/const.py

Lines changed: 0 additions & 1 deletion
@@ -1,6 +1,5 @@
 """ Pypiper constants. """
 
-
 CHECKPOINT_EXTENSION = ".checkpoint"
 DEFAULT_SAMPLE_NAME = "DEFAULT_SAMPLE_NAME"
 PIPELINE_CHECKPOINT_DELIMITER = "_"

pypiper/manager.py

Lines changed: 64 additions & 98 deletions
@@ -52,12 +52,13 @@
     default_pipestat_output_schema,
     result_formatter_markdown,
 )
-from pipestat.helpers import read_yaml_data
+from yacman import load_yaml
 
 __all__ = ["PipelineManager"]
 
 
 LOCK_PREFIX = "lock."
+LOGFILE_SUFFIX = "_log.md"
 
 
 class Unbuffered(object):
@@ -112,6 +113,12 @@ class PipelineManager(object):
         protect from a case in which a restart begins upstream of a stage
         for which a checkpoint file already exists, but that depends on the
         upstream stage and thus should be rerun if it's "parent" is rerun.
+    :param str pipestat_record_identifier: record_identifier to report results via pipestat
+    :param str pipestat_schema: output schema used by pipestat to report results
+    :param str pipestat_results_file: path to file backend for reporting results
+    :param str pipestat_config_file: path to pipestat configuration file
+    :param str pipestat_pipeline_type: Sample or Project level pipeline
+    :param pipestat_result_formatter: function used to style reported results, defaults to result_formatter_markdown
     :raise TypeError: if start or stop point(s) are provided both directly and
         via args namespace, or if both stopping types (exclusive/prospective
         and inclusive/retrospective) are provided.
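
A sketch of how the pipestat parameters documented above might be combined for a file-backed setup; the schema and results-file paths here are hypothetical, not files shipped with pypiper:

import pypiper

pm = pypiper.PipelineManager(
    name="count_lines",
    outfolder="pipeline_output/",
    pipestat_record_identifier="sample1",  # name results are recorded under
    pipestat_schema="output_schema.yaml",  # pipestat output schema (hypothetical path)
    pipestat_results_file="pipeline_output/stats.yaml",  # file backend (hypothetical path)
    pipestat_pipeline_type="sample",  # sample- or project-level pipeline
)
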
@@ -136,8 +143,7 @@ def __init__(
         output_parent=None,
         overwrite_checkpoints=False,
         logger_kwargs=None,
-        pipestat_project_name=None,
-        pipestat_sample_name=None,
+        pipestat_record_identifier=None,
         pipestat_schema=None,
         pipestat_results_file=None,
         pipestat_config=None,
@@ -193,10 +199,7 @@ def __init__(
         # If no starting point was specified, assume that the pipeline's
         # execution is to begin right away and set the internal flag so that
         # run() is let loose to execute instructions given.
-        if not self.start_point:
-            self._active = True
-        else:
-            self._active = False
+        self._active = not self.start_point
 
         # Pipeline-level variables to track global state and pipeline stats
         # Pipeline settings
@@ -210,26 +213,37 @@ def __init__(
         self.output_parent = params["output_parent"]
         self.testmode = params["testmode"]
 
+        # Establish the log file to check safety with logging keyword arguments.
+        # Establish the output folder since it's required for the log file.
+        self.outfolder = os.path.join(outfolder, "")  # trailing slash
+        self.pipeline_log_file = pipeline_filepath(self, suffix=LOGFILE_SUFFIX)
+
         # Set up logger
         logger_kwargs = logger_kwargs or {}
+        if logger_kwargs.get("logfile") == self.pipeline_log_file:
+            raise ValueError(
+                f"The logfile given for the pipeline manager's logger matches that which will be used by the manager itself: {self.pipeline_log_file}"
+            )
         default_logname = ".".join([__name__, self.__class__.__name__, self.name])
-        if not args:
+        self._logger = None
+        if args:
+            logger_builder_method = "logger_via_cli"
+            try:
+                self._logger = logger_via_cli(args, **logger_kwargs)
+            except logmuse.est.AbsentOptionException as e:
+                # Defer logger construction to init_logger.
+                self.debug(f"logger_via_cli failed: {e}")
+        if self._logger is None:
+            logger_builder_method = "init_logger"
+            # covers cases of bool(args) being False, or failure of logger_via_cli.
             # strict is only for logger_via_cli.
-            kwds = {k: v for k, v in logger_kwargs.items() if k != "strict"}
+            logger_kwargs = {k: v for k, v in logger_kwargs.items() if k != "strict"}
             try:
-                name = kwds.pop("name")
+                name = logger_kwargs.pop("name")
             except KeyError:
                 name = default_logname
-            self._logger = logmuse.init_logger(name, **kwds)
-            self.debug("Logger set with logmuse.init_logger")
-        else:
-            logger_kwargs.setdefault("name", default_logname)
-            try:
-                self._logger = logmuse.logger_via_cli(args)
-                self.debug("Logger set with logmuse.logger_via_cli")
-            except logmuse.est.AbsentOptionException:
-                self._logger = logmuse.init_logger("pypiper", level="DEBUG")
-                self.debug("logger_via_cli failed; Logger set with logmuse.init_logger")
+            self._logger = logmuse.init_logger(name, **logger_kwargs)
+        self.debug(f"Logger set with {logger_builder_method}")
 
         # Keep track of an ID for the number of processes attempted
         self.proc_count = 0
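
The reworked logger setup above raises ValueError when logger_kwargs points the logger at the file the manager itself will write (the pipeline name plus "_log.md" inside the output folder). A caller-side sketch with illustrative paths; it assumes logmuse accepts a "logfile" keyword, as the new guard implies:

import pypiper

pm = pypiper.PipelineManager(
    name="count_lines",
    outfolder="pipeline_output/",
    # pipeline_output/count_lines_log.md is the manager's own log, so pointing
    # the logger there would now raise ValueError; use a separate file instead.
    logger_kwargs={"logfile": "pipeline_output/count_lines_debug.log"},
)
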
@@ -276,10 +290,7 @@ def __init__(
         # self.output_parent = os.path.join(os.getcwd(), self.output_parent)
 
         # File paths:
-        self.outfolder = os.path.join(outfolder, "")  # trailing slash
         self.make_sure_path_exists(self.outfolder)
-        self.pipeline_log_file = pipeline_filepath(self, suffix="_log.md")
-
         self.pipeline_profile_file = pipeline_filepath(self, suffix="_profile.tsv")
 
         # Stats and figures are general and so lack the pipeline name.
@@ -330,7 +341,9 @@ def __init__(
         signal.signal(signal.SIGTERM, self._signal_term_handler)
 
         # pipestat setup
-        self.pipestat_record_identifier = pipestat_sample_name or DEFAULT_SAMPLE_NAME
+        self.pipestat_record_identifier = (
+            pipestat_record_identifier or DEFAULT_SAMPLE_NAME
+        )
         self.pipestat_pipeline_type = pipestat_pipeline_type or "sample"
 
         # don't force default pipestat_results_file value unless
@@ -631,88 +644,41 @@ def start_pipeline(self, args=None, multi=False):
         # Print out a header section in the pipeline log:
         # Wrap things in backticks to prevent markdown from interpreting underscores as emphasis.
         # print("----------------------------------------")
-        self.info("### Pipeline run code and environment:\n")
-        self.info(
-            "* " + "Command".rjust(20) + ": " + "`" + str(" ".join(sys.argv)) + "`"
-        )
-        self.info("* " + "Compute host".rjust(20) + ": " + platform.node())
-        self.info("* " + "Working dir".rjust(20) + ": " + os.getcwd())
-        self.info("* " + "Outfolder".rjust(20) + ": " + self.outfolder)
+        def logfmt(key, value=None, padding=16):
+            padded_key = key.rjust(padding)
+            formatted_val = f"`{value}`" if value else ""
+            return f"* {padded_key}: {formatted_val}"
 
-        self.timestamp("* " + "Pipeline started at".rjust(20) + ": ")
+        self.info("### Pipeline run code and environment:\n")
+        self.info(logfmt("Command", str(" ".join(sys.argv))))
+        self.info(logfmt("Compute host", platform.node()))
+        self.info(logfmt("Working dir", os.getcwd()))
+        self.info(logfmt("Outfolder", self.outfolder))
+        self.info(logfmt("Log file", self.pipeline_log_file))
+        self.timestamp(logfmt("Start time"))
 
         self.info("\n### Version log:\n")
-        self.info("* " + "Python version".rjust(20) + ": " + platform.python_version())
+        self.info(logfmt("Python version", platform.python_version()))
         try:
-            self.info(
-                "* "
-                + "Pypiper dir".rjust(20)
-                + ": "
-                + "`"
-                + gitvars["pypiper_dir"].strip()
-                + "`"
-            )
-            self.info("* " + "Pypiper version".rjust(20) + ": " + __version__)
-            self.info(
-                "* " + "Pypiper hash".rjust(20) + ": " + str(gitvars["pypiper_hash"])
-            )
-            self.info(
-                "* "
-                + "Pypiper branch".rjust(20)
-                + ": "
-                + str(gitvars["pypiper_branch"])
-            )
-            self.info(
-                "* " + "Pypiper date".rjust(20) + ": " + str(gitvars["pypiper_date"])
-            )
+            self.info(logfmt("Pypiper dir", gitvars["pypiper_dir"].strip()))
+            self.info(logfmt("Pypiper version", __version__))
+            self.info(logfmt("Pypiper hash", gitvars["pypiper_hash"]))
+            self.info(logfmt("Pypiper branch", gitvars["pypiper_branch"]))
+            self.info(logfmt("Pypiper date", gitvars["pypiper_date"]))
             if gitvars["pypiper_diff"]:
-                self.info(
-                    "* "
-                    + "Pypiper diff".rjust(20)
-                    + ": "
-                    + str(gitvars["pypiper_diff"])
-                )
+                self.info(logfmt("Pypiper diff", gitvars["pypiper_diff"]))
         except KeyError:
             # It is ok if keys aren't set, it means pypiper isn't in a git repo.
             pass
 
         try:
-            self.info(
-                "* "
-                + "Pipeline dir".rjust(20)
-                + ": "
-                + "`"
-                + gitvars["pipe_dir"].strip()
-                + "`"
-            )
-            self.info(
-                "* " + "Pipeline version".rjust(20) + ": " + str(self.pl_version)
-            )
-            self.info(
-                "* "
-                + "Pipeline hash".rjust(20)
-                + ": "
-                + str(gitvars["pipe_hash"]).strip()
-            )
-            self.info(
-                "* "
-                + "Pipeline branch".rjust(20)
-                + ": "
-                + str(gitvars["pipe_branch"]).strip()
-            )
-            self.info(
-                "* "
-                + "Pipeline date".rjust(20)
-                + ": "
-                + str(gitvars["pipe_date"]).strip()
-            )
+            self.info(logfmt("Pipeline dir", gitvars["pipe_dir"].strip()))
+            self.info(logfmt("Pipeline version", self.pl_version))
+            self.info(logfmt("Pipeline hash", gitvars["pipe_hash"]).strip())
+            self.info(logfmt("Pipeline branch", gitvars["pipe_branch"]).strip())
+            self.info(logfmt("Pipeline date", gitvars["pipe_date"]).strip())
             if gitvars["pipe_diff"] != "":
-                self.info(
-                    "* "
-                    + "Pipeline diff".rjust(20)
-                    + ": "
-                    + str(gitvars["pipe_diff"]).strip()
-                )
+                self.info(logfmt("Pipeline diff", gitvars["pipe_diff"]).strip())
         except KeyError:
             # It is ok if keys aren't set, it means the pipeline isn't a git repo.
             pass
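
The run-header and version-log lines above are now built by the nested logfmt helper instead of repeated rjust/concatenation. A standalone sketch of the same formatting (the sample values are illustrative):

def logfmt(key, value=None, padding=16):
    # Right-justify the key and wrap any value in backticks for the markdown log.
    padded_key = key.rjust(padding)
    formatted_val = f"`{value}`" if value else ""
    return f"* {padded_key}: {formatted_val}"

print(logfmt("Compute host", "node01"))  # -> "*     Compute host: `node01`"
print(logfmt("Start time"))              # -> "*       Start time: "
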
@@ -1593,7 +1559,7 @@ def _report_profile(
             myfile.write(message_raw + "\n")
 
     def report_result(
-        self, key, value, nolog=False, result_formatter=None, force_overwrite=False
+        self, key, value, nolog=False, result_formatter=None, force_overwrite=True
     ):
         """
         Writes a key:value pair to self.pipeline_stats_file.
@@ -1640,7 +1606,7 @@ def report_object(
         annotation=None,
         nolog=False,
         result_formatter=None,
-        force_overwrite=False,
+        force_overwrite=True,
     ):
         """
         Writes a key:value pair to self.pipeline_stats_file. Note: this function
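
With force_overwrite now defaulting to True in both report_result and report_object (Issue #209), re-reporting a key replaces the earlier value without an explicit flag. A sketch using a manager like the ones constructed in the sketches above (the key is illustrative and, with a pipestat backend, would need to exist in the output schema):

pm.report_result("aligned_reads", 1500000)
pm.report_result("aligned_reads", 1600000)  # overwrites the first value under the new default
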
@@ -1862,7 +1828,7 @@ def _refresh_stats(self):
         """
 
         if os.path.isfile(self.pipeline_stats_file):
-            _, data = read_yaml_data(path=self.pipeline_stats_file, what="stats_file")
+            data = load_yaml(filepath=self.pipeline_stats_file)
 
             for key, value in data[self._pipestat_manager.pipeline_name][
                 self._pipestat_manager.pipeline_type
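
_refresh_stats now reads the stats file through yacman's load_yaml, which returns the parsed mapping directly, rather than unpacking the tuple returned by pipestat's read_yaml_data. A sketch of the same read outside the manager, with a hypothetical stats-file path:

from yacman import load_yaml

# Parse a pipestat-style stats file into a plain dict, as _refresh_stats does,
# then walk the pipeline_name -> pipeline_type nesting it expects.
data = load_yaml(filepath="pipeline_output/stats.yaml")
for pipeline_name, results_by_type in data.items():
    print(pipeline_name, list(results_by_type))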

pypiper/ngstk.py

Lines changed: 1 addition & 2 deletions
@@ -153,8 +153,7 @@ def get_file_size(self, filenames):
             return sum([self.get_file_size(filename) for filename in filenames])
 
         return round(
-            sum([float(os.stat(f).st_size) for f in filenames.split(" ")])
-            / (1024**2),
+            sum([float(os.stat(f).st_size) for f in filenames.split(" ")]) / (1024**2),
             4,
         )
 

pypiper/utils.py

Lines changed: 1 addition & 3 deletions
@@ -785,12 +785,10 @@ def pipeline_filepath(pm, filename=None, suffix=None):
         filename as given or determined by the pipeline name, and suffix
         appended if given.
     """
-
     if filename is None and suffix is None:
         raise TypeError(
-            "Provide filename and/or suffix to create " "path to a pipeline file."
+            "Provide filename and/or suffix to create path to a pipeline file."
         )
-
     filename = (filename or pm.name) + (suffix or "")
 
     # Note that Pipeline and PipelineManager define the same outfolder.
