From d298a0a750bbe20ef0a91df167513b1661e71a71 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Fri, 12 Sep 2025 16:21:41 +0100 Subject: [PATCH 01/15] Edits following first set of review comments. --- improver/calibration/__init__.py | 80 ++++ ...train_quantile_regression_random_forest.py | 238 ++++----- improver/cli/__init__.py | 6 +- ...train_quantile_regression_random_forest.py | 452 +++++++++++++----- 4 files changed, 527 insertions(+), 249 deletions(-) diff --git a/improver/calibration/__init__.py b/improver/calibration/__init__.py index 08e432adcc..5b5dc4bbf3 100644 --- a/improver/calibration/__init__.py +++ b/improver/calibration/__init__.py @@ -7,8 +7,12 @@ """ from collections import OrderedDict +from pathlib import Path from typing import Dict, List, Optional, Tuple, Union +import iris +import joblib +import pyarrow.parquet as pq from iris.cube import Cube, CubeList from improver.metadata.probabilistic import ( @@ -263,6 +267,82 @@ def split_forecasts_and_bias_files(cubes: CubeList) -> Tuple[Cube, Optional[Cube return forecast_cube, bias_cubes +def split_pickle_parquet_and_netcdf(files): + """Split the input files into pickle, parquet, and netcdf files. + Only a single pickle file is expected. + Args: + files: + A list of input file paths which will be split into pickle, + parquet, and netcdf files. + Returns: + - A flattened cube list containing all the cubes contained within the + provided paths to NetCDF files. + - A list of paths to Parquet files. + - A loaded pickle file. + Raises: + ValueError: If multiple pickle files provided, as only one is ever expected. + """ + cubes = iris.cube.CubeList() + loaded_pickles = [] + parquets = [] + + for file_path in files: + if not file_path.exists(): + continue + + # Directories indicate we are working with parquet files. + if file_path.is_dir(): + parquets.append(file_path) + continue + + try: + cube = iris.load(file_path) + cubes.extend(cube) + except ValueError: + try: + loaded_pickles.append(joblib.load(file_path)) + except Exception as e: + msg = f"Failed to load {file_path}: {e}" + raise ValueError(msg) + + if len(loaded_pickles) > 1: + msg = "Multiple pickle inputs have been provided. Only one is expected." + raise ValueError(msg) + + return ( + cubes if cubes else None, + parquets if parquets else None, + loaded_pickles[0] if loaded_pickles else None, + ) + + +def identify_parquet_type(parquet_paths: List[Path]): + """Determine whether the provided parquet paths contain forecast or truth data. + This is done by checking the columns within the parquet files for the presence + of a forecast_period column which is only present for forecast data. + Args: + parquet_paths: + A list of paths to Parquet files. + Returns: + - The path to the Parquet file containing the historical forecasts. + - The path to the Parquet file containing the truths. + """ + forecast_table_path = None + truth_table_path = None + for file_path in parquet_paths: + try: + example_file_path = next(file_path.glob("**/*.parquet")) + except StopIteration: + continue + try: + pq.read_schema(example_file_path).field("forecast_period") + forecast_table_path = file_path + except KeyError: + truth_table_path = file_path + + return forecast_table_path, truth_table_path + + def validity_time_check(forecast: Cube, validity_times: List[str]) -> bool: """Check the validity time of the forecast matches the accepted validity times within the validity times list. diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 421ca110cb..5d06201d7a 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -12,21 +12,27 @@ import iris import numpy as np import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq from iris.pandas import as_data_frame from improver import PostProcessingPlugin -from improver.calibration import CalibrationSchemas +from improver.calibration import ( + CalibrationSchemas, + identify_parquet_type, + split_pickle_parquet_and_netcdf, +) from improver.calibration.quantile_regression_random_forest import ( TrainQuantileRegressionRandomForests, quantile_forest_package_available, ) -from improver.utilities.load import load_cube iris.FUTURE.pandas_ndim = True -class LoadAndTrainQRF(PostProcessingPlugin): - """Plugin to load and train a Quantile Regression Random Forest (QRF) model.""" +class LoadForQRF(PostProcessingPlugin): + """Plugin to load input files for training a Quantile Regression Random Forest + (QRF) model.""" def __init__( self, @@ -37,14 +43,9 @@ def __init__( cycletime: str, training_length: int, experiment: Optional[str] = None, - n_estimators: int = 100, - max_depth: Optional[int] = None, - max_samples: Optional[float] = None, - random_state: Optional[int] = None, - transformation: Optional[str] = None, - pre_transform_addition: float = 0, ): - """Initialise the LoadAndTrainQRF plugin.""" + """Initialise the LoadForQRF plugin.""" + self.quantile_forest_installed = quantile_forest_package_available() self.feature_config = feature_config self.target_diagnostic_name = target_diagnostic_name self.target_cf_name = target_cf_name @@ -52,73 +53,31 @@ def __init__( self.cycletime = cycletime self.training_length = training_length self.experiment = experiment - self.n_estimators = n_estimators - self.max_depth = max_depth - self.max_samples = max_samples - self.random_state = random_state - self.transformation = transformation - self.pre_transform_addition = pre_transform_addition - self.quantile_forest_installed = quantile_forest_package_available() - def _split_cubes_and_parquet_files( - self, file_paths: list[pathlib.Path | str] - ) -> tuple[Optional[pathlib.Path], Optional[pathlib.Path], iris.cube.CubeList]: - """Split the input file paths into cubes and parquet files. - - Args: - file_paths: List of file paths. + def _parse_forecast_periods(self) -> list[int]: + """Parse the forecast periods argument to produce a list of forecast periods + in seconds. Returns: - Tuple containing the items below if found: - - Path to the forecast parquet file. - - Path to the truth parquet file. - - List of cubes loaded from the NetCDF files. - + List of forecast periods in seconds. Raises: - ValueError: If the number of cubes loaded does not match the number of - features expected. + ValueError: If the forecast_periods argument is not a single integer or + a range in the form 'start:end:interval'. """ - import pyarrow.parquet as pq - - forecast_table_path = None - truth_table_path = None - cube_inputs = iris.cube.CubeList([]) - - # file extraction loop: - for file_path in file_paths: - if not Path(file_path).exists(): - # This will occur when the filepath does not exist. In this case, - # return None. - return None, None, None - elif Path(file_path).is_dir(): - try: - example_file_path = next(Path(file_path).glob("**/*.parquet")) - except StopIteration: - # If no parquet files are found, return None. - return None, None, None - try: - pq.read_schema(example_file_path).field("forecast_period") - forecast_table_path = file_path - except KeyError: - truth_table_path = file_path - else: - cube = load_cube(str(file_path)) - cube_inputs.append(cube) - - if len(self.feature_config.keys()) not in [ - len(cube_inputs), - len(cube_inputs) + 1, - ]: - msg = ( - "The number of cubes loaded does not match the number of features " - "expected. These can mismatch if some features are coming from the " - "historic forecast table. The number of cubes loaded was: " - f"{len(cube_inputs)}. The number of features expected was: " - f"{len(self.feature_config.keys())}." - ) - raise ValueError(msg) - - return forecast_table_path, truth_table_path, cube_inputs + if ":" in self.forecast_periods: + forecast_periods = list(range(*map(int, self.forecast_periods.split(":")))) + forecast_periods = [fp * 3600 for fp in forecast_periods] + else: + try: + forecast_periods = [int(self.forecast_periods) * 3600] + except ValueError: + msg = ( + "The forecast_periods argument must be a single integer or " + "a range in the form 'start:end:interval'. The forecast period" + f"provided was: {self.forecast_periods}." + ) + raise ValueError(msg) + return forecast_periods def _read_parquet_files( self, @@ -127,7 +86,7 @@ def _read_parquet_files( forecast_periods: list[int], ) -> tuple[pd.DataFrame, pd.DataFrame]: """Read the forecast and truth data from parquet files. - + self.quantile_forest_installed = quantile_forest_package_available() Args: forecast_table_path: Path to the forecast parquet file. truth_table_path: Path to the truth parquet file. @@ -144,9 +103,6 @@ def _read_parquet_files( ValueError: If the truth parquet file does not contain the expected fields. """ - import pyarrow as pa - import pyarrow.parquet as pq - cycletimes = [] for forecast_period in forecast_periods: # Load forecasts from parquet file filtering by diagnostic and blend_time. @@ -195,14 +151,17 @@ def _read_parquet_files( forecast_df = forecast_df[ forecast_df["forecast_period"].isin(np.array(forecast_periods) * 1e9) - ] + ].reset_index(drop=True) + # Convert df columns from ns to pandas timestamp object. for column in ["time", "forecast_reference_time", "blend_time"]: forecast_df[column] = pd.to_datetime( forecast_df[column], unit="ns", utc=True ) + forecast_df[column] = forecast_df[column].astype("datetime64[ns, UTC]") for column in ["forecast_period", "period"]: forecast_df[column] = pd.to_timedelta(forecast_df[column], unit="ns") + forecast_df[column] = forecast_df[column].astype("timedelta64[ns]") forecast_df = forecast_df.rename(columns={"forecast": self.target_cf_name}) @@ -216,6 +175,7 @@ def _read_parquet_files( ) truth_df["time"] = pd.to_datetime(truth_df["time"], unit="ns", utc=True) + truth_df["time"] = truth_df["time"].astype("datetime64[ns, UTC]") if truth_df.empty: msg = ( @@ -225,6 +185,82 @@ def _read_parquet_files( raise IOError(msg) return forecast_df, truth_df + def process( + self, + file_paths: list[pathlib.Path | str], + ) -> Optional[tuple[iris.cube.CubeList, pathlib.Path | str, pathlib.Path | str]]: + """Load input files for training a Quantile Regression Random Forest (QRF) + model. Two sources of input data must be provided: historical forecasts and + historical truth data (to use in calibration). + + Args: + file_paths (cli.inputpaths): + A list of input paths containing: + - The path to a Parquet file containing the truths to be used + for calibration. The expected columns within the + Parquet file are: ob_value, time, wmo_id, diagnostic, latitude, + longitude and altitude. + - The path to a Parquet file containing the forecasts to be used + for calibration. + - Optionally, paths to NetCDF files containing additional predictors. + """ + if not self.quantile_forest_installed: + return None + cube_inputs, parquets, _ = split_pickle_parquet_and_netcdf(file_paths) + + forecast_table_path, truth_table_path = identify_parquet_type(parquets) + + if not forecast_table_path or not truth_table_path: + return None + + forecast_periods = self._parse_forecast_periods() + forecast_df, truth_df = self._read_parquet_files( + forecast_table_path, truth_table_path, forecast_periods + ) + + if cube_inputs is None: + cube_inputs = iris.cube.CubeList() + missing_features = ( + set(self.feature_config.keys()) + - set(forecast_df.columns) + - set([c.name() for c in cube_inputs]) + ) + + if len(missing_features) > 0: + msg = ( + "The features requested in the feature_config are absent from " + "the forecast parquet file and the input cubes. The missing fields are: " + f"{missing_features}." + ) + raise ValueError(msg) + return forecast_df, truth_df, cube_inputs + + +class PrepareAndTrainQRF(PostProcessingPlugin): + """Plugin to prepare and train a Quantile Regression Random Forest (QRF) model.""" + + def __init__( + self, + feature_config: dict[str, list[str]], + target_cf_name: str, + n_estimators: int = 100, + max_depth: Optional[int] = None, + max_samples: Optional[float] = None, + random_state: Optional[int] = None, + transformation: Optional[str] = None, + pre_transform_addition: float = 0, + ): + """Initialise the PrepareAndTrainQRF plugin.""" + self.feature_config = feature_config + self.target_cf_name = target_cf_name + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_samples = max_samples + self.random_state = random_state + self.transformation = transformation + self.pre_transform_addition = pre_transform_addition + self.quantile_forest_installed = quantile_forest_package_available() + @staticmethod def _check_matching_times( forecast_df: pd.DataFrame, truth_df: pd.DataFrame @@ -298,8 +334,9 @@ def filter_bad_sites( def process( self, - file_paths: list[pathlib.Path | str], - model_output: str = None, + forecast_df: pd.DataFrame, + truth_df: pd.DataFrame, + cube_inputs: Optional[iris.cube.CubeList] = None, ) -> None: """Load input files and training a Quantile Regression Random Forest (QRF) model. This model can be applied later to calibrate the forecast. Two sources @@ -307,45 +344,17 @@ def process( (to use in calibration). The model is output as a pickle file. Args: - file_paths (cli.inputpaths): - A list of input paths containing: - - The path to a Parquet file containing the truths to be used - for calibration. The expected columns within the - Parquet file are: ob_value, time, wmo_id, diagnostic, latitude, - longitude and altitude. - - The path to a Parquet file containing the forecasts to be used - for calibration. - - Optionally, paths to NetCDF files containing additional predictors. - model_output (str): - Full path including model file name that will store the pickled model. + forecast_df: DataFrame containing the forecast data. + truth_df: DataFrame containing the truth data. + cube_inputs: List of cubes containing additional features. + Raises: + ValueError: If the number of cubes loaded does not match the number of + features expected. """ if not self.quantile_forest_installed: return None - forecast_table_path, truth_table_path, cube_inputs = ( - self._split_cubes_and_parquet_files(file_paths) - ) - - if not forecast_table_path or not truth_table_path: - return None - if ":" in self.forecast_periods: - forecast_periods = list(range(*map(int, self.forecast_periods.split(":")))) - forecast_periods = [fp * 3600 for fp in forecast_periods] - else: - try: - forecast_periods = [int(self.forecast_periods) * 3600] - except ValueError: - msg = ( - "The forecast_periods argument must be a single integer or " - "a range in the form 'start:end:interval'. The forecast period" - f"provided was: {self.forecast_periods}." - ) - raise ValueError(msg) - - forecast_df, truth_df = self._read_parquet_files( - forecast_table_path, truth_table_path, forecast_periods - ) intersecting_times = self._check_matching_times(forecast_df, truth_df) if len(intersecting_times) == 0: return None @@ -362,6 +371,5 @@ def process( random_state=self.random_state, transformation=self.transformation, pre_transform_addition=self.pre_transform_addition, - model_output=model_output, )(forecast_df, truth_df) return result diff --git a/improver/cli/__init__.py b/improver/cli/__init__.py index 7a6be5a5c6..7ee8ba5b67 100644 --- a/improver/cli/__init__.py +++ b/improver/cli/__init__.py @@ -348,17 +348,19 @@ def with_output( Result of calling `wrapped` or None if `output` is given. """ import joblib + from iris.cube import Cube, CubeList from improver.utilities.save import save_netcdf result = wrapped(*args, **kwargs) - if output and output.endswith(".nc"): + if output and (isinstance(result, Cube) or isinstance(result, CubeList)): save_netcdf(result, output, compression_level, least_significant_digit) if pass_through_output: return ObjectAsStr(result, output) return - elif output and output.endswith((".pickle", ".pkl")): + elif output and result: + # If output is set and result exists but is not a Cube, save it as a pickle file joblib.dump(result, output, compress=compression_level) return return result diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index 396a0f6bbe..3108c265fb 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -2,7 +2,7 @@ # # This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. # See LICENSE in the root of the repository for full licensing details. -"""Unit tests for the LoadAndTrainQRF plugin.""" +"""Unit tests for the LoadForQRF and PrepareAndTrain QRF plugins.""" import iris import numpy as np @@ -10,7 +10,8 @@ import pytest from improver.calibration.load_and_train_quantile_regression_random_forest import ( - LoadAndTrainQRF, + LoadForQRF, + PrepareAndTrainQRF, ) from improver.synthetic_data.set_up_test_cubes import set_up_spot_variable_cube @@ -24,24 +25,30 @@ def _create_multi_site_forecast_parquet_file(tmp_path, representation="percentile"): - """Create a parquet file with multi-site forecast data.""" + """Create a parquet file with multi-site forecast data. + + Args: + tmp_path: Temporary path to save the parquet file. + representation: The type of ensemble representation to use. Options are + "percentile" or "realization". + """ data_dict = { - "percentile": np.repeat(50, 5), - "forecast": [281, 272, 287, 280, 290], - "altitude": [10, 83, 56, 23, 2], - "blend_time": [pd.Timestamp("2017-01-02 00:00:00")] * 5, + "percentile": np.repeat(50.0, 5), + "forecast": [281.0, 272.0, 287.0, 280.0, 290.0], + "altitude": [10.0, 83.0, 56.0, 23.0, 2.0], + "blend_time": [pd.Timestamp("2017-01-02 00:00:00", tz="utc")] * 5, "forecast_period": [6 * 3.6e12] * 5, - "forecast_reference_time": [pd.Timestamp("2017-01-02 00:00:00")] * 5, + "forecast_reference_time": [pd.Timestamp("2017-01-02 00:00:00", tz="utc")] * 5, "latitude": [60.1, 59.9, 59.7, 58, 57], - "longitude": [1, 2, -1, -2, -3], - "time": [pd.Timestamp("2017-01-02 06:00:00")] * 5, + "longitude": [1.0, 2.0, -1.0, -2.0, -3.0], + "time": [pd.Timestamp("2017-01-02 06:00:00", tz="utc")] * 5, "wmo_id": ["03001", "03002", "03003", "03004", "03005"], "station_id": ["03001", "03002", "03003", "03004", "03005"], "cf_name": ["air_temperature"] * 5, "units": ["K"] * 5, "experiment": ["latestblend"] * 5, - "period": [pd.NaT] * 5, + "period": [pd.NA] * 5, "height": [1.5] * 5, "diagnostic": ["temperature_at_screen_level"] * 5, } @@ -64,9 +71,10 @@ def _create_multi_site_forecast_parquet_file(tmp_path, representation="percentil output_dir = tmp_path / "forecast_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) - output_path = str(output_dir / "forecast.parquet") + output_path = output_dir / "forecast.parquet" joined_df.to_parquet(output_path, index=False, engine="pyarrow") - return output_dir, data_dict["wmo_id"] + + return output_dir, joined_df, data_dict["wmo_id"] def _create_multi_percentile_forecast_parquet_file(tmp_path, representation=None): @@ -74,20 +82,20 @@ def _create_multi_percentile_forecast_parquet_file(tmp_path, representation=None data_dict = { "percentile": [16 + 2 / 3, 33 + 1 / 3, 50, 66 + 2 / 3, 83 + 1 / 3], - "forecast": [272, 274, 275, 277, 280], - "altitude": [10, 10, 10, 10, 10], - "blend_time": [pd.Timestamp("2017-01-02 00:00:00")] * 5, + "forecast": [272.0, 274.0, 275.0, 277.0, 280.0], + "altitude": [10.0, 10.0, 10.0, 10.0, 10.0], + "blend_time": [pd.Timestamp("2017-01-02 00:00:00", tz="utc")] * 5, "forecast_period": [6 * 3.6e12] * 5, - "forecast_reference_time": [pd.Timestamp("2017-01-02 00:00:00")] * 5, + "forecast_reference_time": [pd.Timestamp("2017-01-02 00:00:00", tz="utc")] * 5, "latitude": [60.1, 60.1, 60.1, 60.1, 60.1], "longitude": [1, 1, 1, 1, 1], - "time": [pd.Timestamp("2017-01-02 06:00:00")] * 5, + "time": [pd.Timestamp("2017-01-02 06:00:00", tz="utc")] * 5, "wmo_id": ["03001", "03001", "03001", "03001", "03001"], "station_id": ["03001", "03001", "03001", "03001", "03001"], "cf_name": ["air_temperature"] * 5, "units": ["K"] * 5, "experiment": ["latestblend"] * 5, - "period": [pd.NaT] * 5, + "period": [pd.NA] * 5, "height": [1.5] * 5, "diagnostic": ["temperature_at_screen_level"] * 5, } @@ -103,18 +111,18 @@ def _create_multi_percentile_forecast_parquet_file(tmp_path, representation=None output_dir = tmp_path / "forecast_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) - output_path = str(output_dir / "forecast.parquet") + output_path = output_dir / "forecast.parquet" joined_df.to_parquet(output_path, index=False, engine="pyarrow") - return output_dir, data_dict["wmo_id"] + return output_dir, joined_df, data_dict["wmo_id"] def _create_multi_forecast_period_forecast_parquet_file(tmp_path, representation=None): """Create a parquet file with multi-forecast period forecast data.""" data_dict = { - "percentile": [50, 50, 50, 50], - "forecast": [277, 270, 280, 269], - "altitude": [10, 83, 10, 83], + "percentile": [50.0, 50.0, 50.0, 50.0], + "forecast": [277.0, 270.0, 280.0, 269.0], + "altitude": [10.0, 83.0, 10.0, 83.0], "blend_time": [pd.Timestamp("2017-01-02 00:00:00")] * 4, "forecast_period": np.repeat( [ @@ -135,7 +143,7 @@ def _create_multi_forecast_period_forecast_parquet_file(tmp_path, representation "cf_name": ["air_temperature"] * 4, "units": ["K"] * 4, "experiment": ["latestblend"] * 4, - "period": [pd.NaT] * 4, + "period": [pd.NA] * 4, "height": [1.5] * 4, "diagnostic": ["temperature_at_screen_level"] * 4, } @@ -151,9 +159,9 @@ def _create_multi_forecast_period_forecast_parquet_file(tmp_path, representation output_dir = tmp_path / "forecast_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) - output_path = str(output_dir / "forecast.parquet") + output_path = output_dir / "forecast.parquet" joined_df.to_parquet(output_path, index=False, engine="pyarrow") - return output_dir, data_dict["wmo_id"] + return output_dir, joined_df, data_dict["wmo_id"] def _create_multi_site_truth_parquet_file(tmp_path): @@ -161,11 +169,11 @@ def _create_multi_site_truth_parquet_file(tmp_path): data_dict = { "diagnostic": ["temperature_at_screen_level"] * 5, "latitude": [60.1, 59.9, 59.7, 58, 57], - "longitude": [1, 2, -1, -2, -3], - "altitude": [10, 83, 56, 23, 2], + "longitude": [1.0, 2.0, -1.0, -2.0, -3.0], + "altitude": [10.0, 83.0, 56.0, 23.0, 2.0], "time": [pd.Timestamp("2017-01-02 06:00:00")] * 5, "wmo_id": ["03001", "03002", "03003", "03004", "03005"], - "ob_value": [276, 270, 289, 290, 301], + "ob_value": [276.0, 270.0, 289.0, 290.0, 301.0], } wind_speed_dict = data_dict.copy() wind_speed_dict["ob_value"] = [3, 22, 24, 11, 9] @@ -176,9 +184,9 @@ def _create_multi_site_truth_parquet_file(tmp_path): output_dir = tmp_path / "truth_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) - output_path = str(output_dir / "truth.parquet") + output_path = output_dir / "truth.parquet" joined_df.to_parquet(output_path, index=False, engine="pyarrow") - return output_dir + return output_dir, joined_df def _create_multi_percentile_truth_parquet_file(tmp_path): @@ -186,11 +194,11 @@ def _create_multi_percentile_truth_parquet_file(tmp_path): data_dict = { "diagnostic": ["temperature_at_screen_level"], "latitude": [60.1], - "longitude": [1], - "altitude": [10], + "longitude": [1.0], + "altitude": [10.0], "time": [pd.Timestamp("2017-01-02 06:00:00")], "wmo_id": ["03001"], - "ob_value": [276], + "ob_value": [276.0], } wind_speed_dict = data_dict.copy() wind_speed_dict["ob_value"] = [9] @@ -201,9 +209,9 @@ def _create_multi_percentile_truth_parquet_file(tmp_path): output_dir = tmp_path / "truth_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) - output_path = str(output_dir / "truth.parquet") + output_path = output_dir / "truth.parquet" joined_df.to_parquet(output_path, index=False, engine="pyarrow") - return output_dir + return output_dir, joined_df def _create_multi_forecast_period_truth_parquet_file(tmp_path): @@ -211,8 +219,8 @@ def _create_multi_forecast_period_truth_parquet_file(tmp_path): data_dict = { "diagnostic": ["temperature_at_screen_level"] * 4, "latitude": [60.1, 59.9, 60.1, 59.9], - "longitude": [1, 2, 1, 2], - "altitude": [10, 83, 10, 83], + "longitude": [1.0, 2.0, 1.0, 2.0], + "altitude": [10.0, 83.0, 10.0, 83.0], "time": np.repeat( [pd.Timestamp("2017-01-02 06:00:00"), pd.Timestamp("2017-01-02 12:00:00")], 2, @@ -221,7 +229,7 @@ def _create_multi_forecast_period_truth_parquet_file(tmp_path): "ob_value": [280, 273, 284, 275], } wind_speed_dict = data_dict.copy() - wind_speed_dict["ob_value"] = [2, 11, 10, 14] + wind_speed_dict["ob_value"] = [2.0, 11.0, 10.0, 14.0] wind_speed_dict["diagnostic"] = "wind_speed_at_10m" data_df = pd.DataFrame(data_dict) wind_speed_df = pd.DataFrame(wind_speed_dict) @@ -229,9 +237,9 @@ def _create_multi_forecast_period_truth_parquet_file(tmp_path): output_dir = tmp_path / "truth_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) - output_path = str(output_dir / "truth.parquet") + output_path = output_dir / "truth.parquet" joined_df.to_parquet(output_path, index=False, engine="pyarrow") - return output_dir + return output_dir, joined_df def _create_multi_site_truth_parquet_file_alt(tmp_path): @@ -239,14 +247,14 @@ def _create_multi_site_truth_parquet_file_alt(tmp_path): data_dict = { "diagnostic": ["wind_speed_at_10m"] * 5, "latitude": [60.1, 59.9, 59.7, 58, 57], - "longitude": [1, 2, -1, -2, -3], - "altitude": [10, 83, 56, 23, 2], + "longitude": [1.0, 2.0, -1.0, -2.0, -3.0], + "altitude": [10.0, 83.0, 56.0, 23.0, 2.0], "time": [pd.Timestamp("2017-01-02 06:00:00")] * 5, "wmo_id": ["03001", "03002", "03003", "03004", "03005"], - "ob_value": [10, 25, 4, 3, 11], + "ob_value": [10.0, 25.0, 4.0, 3.0, 11.0], } wind_speed_dict = data_dict.copy() - wind_speed_dict["ob_value"] = [3, 22, 24, 11, 9] + wind_speed_dict["ob_value"] = [3.0, 22.0, 24.0, 11.0, 9.0] wind_speed_dict["diagnostic"] = "wind_speed_at_10m" data_df = pd.DataFrame(data_dict) wind_speed_df = pd.DataFrame(wind_speed_dict) @@ -254,9 +262,9 @@ def _create_multi_site_truth_parquet_file_alt(tmp_path): output_dir = tmp_path / "truth_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) - output_path = str(output_dir / "truth.parquet") + output_path = output_dir / "truth.parquet" joined_df.to_parquet(output_path, index=False, engine="pyarrow") - return output_dir + return output_dir, joined_df def _create_ancil_file(tmp_path, wmo_ids): @@ -277,13 +285,46 @@ def _create_ancil_file(tmp_path, wmo_ids): cube.remove_coord(coord) output_dir = tmp_path / "ancil_files" output_dir.mkdir(parents=True) - output_path = str(output_dir / "ancil.nc") - iris.save(cube, output_path) - return output_path + output_path = output_dir / "ancil.nc" + iris.save(cube, str(output_path)) + return output_path, cube + + +def filter_forecast_periods(forecast_df, forecast_periods): + """Filter the forecast DataFrame to only include the requested forecast periods.""" + if ":" in forecast_periods: + forecast_periods = [ + fp * 3600 for fp in range(*map(int, forecast_periods.split(":"))) + ] + else: + forecast_periods = [int(forecast_periods) * 3600] + return forecast_df[ + forecast_df["forecast_period"].isin(np.array(forecast_periods) * 1e9) + ].reset_index(drop=True) + + +def amend_expected_forecast_df( + forecast_df, forecast_periods, target_diagnostic_name, target_cf_name +): + forecast_df = filter_forecast_periods(forecast_df, forecast_periods) + forecast_df = forecast_df[forecast_df["diagnostic"] == target_diagnostic_name] + for column in ["time", "forecast_reference_time", "blend_time"]: + forecast_df[column] = pd.to_datetime(forecast_df[column], unit="ns", utc=True) + for column in ["forecast_period", "period"]: + forecast_df[column] = pd.to_timedelta(forecast_df[column], unit="ns") + + forecast_df = forecast_df.rename(columns={"forecast": target_cf_name}) + return forecast_df + + +def amend_expected_truth_df(truth_df, target_diagnostic_name): + truth_df = truth_df[truth_df["diagnostic"] == target_diagnostic_name] + truth_df["time"] = pd.to_datetime(truth_df["time"], unit="ns", utc=True) + return truth_df @pytest.mark.parametrize( - "forecast_creation,truth_creation,forecast_periods,include_static,remove_target,representation,expected", + "forecast_creation,truth_creation,forecast_periods,include_static,remove_target,representation", [ ( _create_multi_site_forecast_parquet_file, @@ -292,7 +333,6 @@ def _create_ancil_file(tmp_path, wmo_ids): False, False, "percentile", - 5.6, ), ( _create_multi_site_forecast_parquet_file, @@ -301,7 +341,6 @@ def _create_ancil_file(tmp_path, wmo_ids): True, False, "percentile", - 5.64, ), ( _create_multi_percentile_forecast_parquet_file, @@ -310,7 +349,6 @@ def _create_ancil_file(tmp_path, wmo_ids): False, False, "percentile", - 5.62, ), ( _create_multi_percentile_forecast_parquet_file, @@ -319,7 +357,6 @@ def _create_ancil_file(tmp_path, wmo_ids): True, False, "percentile", - 5.62, ), ( _create_multi_forecast_period_forecast_parquet_file, @@ -328,7 +365,6 @@ def _create_ancil_file(tmp_path, wmo_ids): False, False, "percentile", - 5.61, ), ( _create_multi_forecast_period_forecast_parquet_file, @@ -337,7 +373,6 @@ def _create_ancil_file(tmp_path, wmo_ids): True, True, # Remove target feature "percentile", - 5.62, ), ( _create_multi_site_forecast_parquet_file, @@ -346,7 +381,6 @@ def _create_ancil_file(tmp_path, wmo_ids): False, False, "realization", # Provide realization input - 5.62, ), ( _create_multi_forecast_period_forecast_parquet_file, @@ -355,11 +389,10 @@ def _create_ancil_file(tmp_path, wmo_ids): False, False, "percentile", - 5.64, ), ], ) -def test_load_and_train_qrf( +def test_load_for_qrf( tmp_path, forecast_creation, truth_creation, @@ -367,20 +400,31 @@ def test_load_and_train_qrf( include_static, remove_target, representation, - expected, ): """Test the LoadAndTrainQRF plugin.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} - n_estimators = 2 - max_depth = 5 - random_state = 46 - forecast_path, wmo_ids = forecast_creation(tmp_path, representation) - truth_path = truth_creation(tmp_path) + forecast_path, expected_forecast_df, wmo_ids = forecast_creation( + tmp_path, representation + ) + expected_forecast_df = amend_expected_forecast_df( + expected_forecast_df, + forecast_periods, + "temperature_at_screen_level", + "air_temperature", + ) + + truth_path, expected_truth_df = truth_creation(tmp_path) + expected_truth_df = amend_expected_truth_df( + expected_truth_df, "temperature_at_screen_level" + ) + file_paths = [forecast_path, truth_path] if include_static: - ancil_path = _create_ancil_file(tmp_path, sorted(list(set(wmo_ids)))) + ancil_path, expected_cube = _create_ancil_file( + tmp_path, sorted(list(set(wmo_ids))) + ) file_paths.append(ancil_path) feature_config["distance_to_water"] = ["static"] @@ -388,7 +432,7 @@ def test_load_and_train_qrf( feature_config.pop("air_temperature") # Create an instance of LoadAndTrainQRF with the required parameters - plugin = LoadAndTrainQRF( + plugin = LoadForQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -396,40 +440,33 @@ def test_load_and_train_qrf( forecast_periods=forecast_periods, cycletime="20170103T0000Z", training_length=2, - n_estimators=n_estimators, - max_depth=max_depth, - random_state=random_state, - transformation="log", - pre_transform_addition=1, ) - qrf_model = plugin(file_paths) + forecast_df, truth_df, cube_inputs = plugin(file_paths) - assert qrf_model.n_estimators == n_estimators - assert qrf_model.max_depth == max_depth - assert qrf_model.random_state == random_state + assert isinstance(forecast_df, pd.DataFrame) + assert isinstance(truth_df, pd.DataFrame) - if remove_target: - current_forecast = [] - else: - current_forecast = [5.64, 3, 55] - - if include_static: - current_forecast.append(2.5) - - result = qrf_model.predict( - np.expand_dims(np.array(current_forecast), 0), quantiles=[0.5] + pd.testing.assert_frame_equal( + forecast_df, + expected_forecast_df, + check_dtype=False, + check_datetimelike_compat=True, ) - np.testing.assert_almost_equal(result, expected, decimal=2) + pd.testing.assert_frame_equal( + truth_df, expected_truth_df, check_dtype=False, check_datetimelike_compat=True + ) + if include_static: + assert isinstance(cube_inputs, iris.cube.CubeList) + assert len(cube_inputs) == 1 + assert cube_inputs[0].name() == "distance_to_water" + np.testing.assert_almost_equal(cube_inputs[0].data, expected_cube.data) @pytest.mark.parametrize("make_files", [(False, True)]) -def test_load_and_train_qrf_no_paths(tmp_path, make_files): +def test_load_for_qrf_no_paths(tmp_path, make_files): """Test the LoadAndTrainQRF plugin when the no valid file paths are provided. Either the paths do not exist, or the paths exist but the directories are empty.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} - n_estimators = 2 - max_depth = 5 - random_state = 46 file_paths = [ tmp_path / "partition" / "forecast_table/", @@ -439,7 +476,7 @@ def test_load_and_train_qrf_no_paths(tmp_path, make_files): for file_path in file_paths: (tmp_path / file_path).mkdir(parents=True, exist_ok=True) - plugin = LoadAndTrainQRF( + plugin = LoadForQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -447,11 +484,6 @@ def test_load_and_train_qrf_no_paths(tmp_path, make_files): forecast_periods="6:12:6", cycletime="20170102T0000Z", training_length=2, - n_estimators=n_estimators, - max_depth=max_depth, - random_state=random_state, - transformation="log", - pre_transform_addition=1, ) result = plugin(file_paths) # Expecting None since no valid paths are provided @@ -459,26 +491,50 @@ def test_load_and_train_qrf_no_paths(tmp_path, make_files): @pytest.mark.parametrize( - "cycletime,forecast_periods", + "forecast_creation,truth_creation,cycletime,forecast_periods", [ - ("20200102T0000Z", "6:12:6"), - ("20170102T0000Z", "30:36:6"), + ( + _create_multi_site_forecast_parquet_file, + _create_multi_site_truth_parquet_file, + "20200102T0000Z", + "6:12:6", + ), + ( + _create_multi_site_forecast_parquet_file, + _create_multi_site_truth_parquet_file, + "20170102T0000Z", + "30:36:6", + ), ], ) -def test_load_and_train_qrf_mismatches(tmp_path, cycletime, forecast_periods): +def test_load_for_qrf_mismatches( + tmp_path, + forecast_creation, + truth_creation, + cycletime, + forecast_periods, +): """Test the LoadAndTrainQRF plugin when the cycletime or forecast_periods requested are not present in the provided files.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} - n_estimators = 2 - max_depth = 5 - random_state = 46 + representation = "percentile" + + forecast_path, expected_forecast_df, _ = forecast_creation(tmp_path, representation) + expected_forecast_df = amend_expected_forecast_df( + expected_forecast_df, + forecast_periods, + "temperature_at_screen_level", + "air_temperature", + ) - file_paths = [ - tmp_path / "partition" / "forecast_table/", - tmp_path / "partition" / "truth_table/", - ] + truth_path, expected_truth_df = truth_creation(tmp_path) + expected_truth_df = amend_expected_truth_df( + expected_truth_df, "temperature_at_screen_level" + ) + + file_paths = [forecast_path, truth_path] - plugin = LoadAndTrainQRF( + plugin = LoadForQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -486,15 +542,11 @@ def test_load_and_train_qrf_mismatches(tmp_path, cycletime, forecast_periods): forecast_periods=forecast_periods, cycletime=cycletime, training_length=2, - n_estimators=n_estimators, - max_depth=max_depth, - random_state=random_state, - transformation="log", - pre_transform_addition=1, ) - result = plugin(file_paths) - # Expecting None since no valid paths are provided - assert result is None + forecast_df, _, _ = plugin(file_paths) + # Expecting an empty DataFrame since the cycletime or forecast_periods + # requested are not present in the provided file. + assert forecast_df.empty @pytest.mark.parametrize( @@ -554,16 +606,13 @@ def test_unexpected( ): """Test LoadAndTrainQRF plugin behaviour in atypical situations.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} - n_estimators = 2 - max_depth = 5 - random_state = 46 - forecast_path, _ = forecast_creation(tmp_path, representation) - truth_path = truth_creation(tmp_path) + forecast_path, _, _ = forecast_creation(tmp_path, representation) + truth_path, _ = truth_creation(tmp_path) file_paths = [forecast_path, truth_path] # Create an instance of LoadAndTrainQRF with the required parameters - plugin = LoadAndTrainQRF( + plugin = LoadForQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -571,11 +620,6 @@ def test_unexpected( forecast_periods=forecast_periods, cycletime="20170103T0000Z", training_length=2, - n_estimators=n_estimators, - max_depth=max_depth, - random_state=random_state, - transformation="log", - pre_transform_addition=1, ) if exception == "non_matching_truth": @@ -587,7 +631,7 @@ def test_unexpected( "distance_to_water": ["static"], } plugin.feature_config = feature_config - with pytest.raises(ValueError, match="The number of cubes loaded."): + with pytest.raises(ValueError, match="The features requested in the"): plugin.process(file_paths=file_paths) elif exception == "missing_dynamic_feature": feature_config = { @@ -595,7 +639,7 @@ def test_unexpected( "air_temperature": ["mean", "std"], } plugin.feature_config = feature_config - with pytest.raises(ValueError, match="The number of cubes loaded."): + with pytest.raises(ValueError, match="The features requested in the"): plugin.process(file_paths=file_paths) elif exception == "no_percentile_realization": with pytest.raises(ValueError, match="The forecast parquet file"): @@ -609,3 +653,147 @@ def test_unexpected( assert result is None else: raise ValueError(f"Unknown exception type: {exception}") + + +@pytest.mark.parametrize( + "forecast_creation,truth_creation,forecast_periods,include_static,remove_target,representation,expected", + [ + ( + _create_multi_site_forecast_parquet_file, + _create_multi_site_truth_parquet_file, + "6:18:6", + False, + False, + "percentile", + 5.6, + ), + ( + _create_multi_site_forecast_parquet_file, + _create_multi_site_truth_parquet_file, + "6:18:6", + True, + False, + "percentile", + 5.64, + ), + ( + _create_multi_percentile_forecast_parquet_file, + _create_multi_percentile_truth_parquet_file, + "6:18:6", + False, + False, + "percentile", + 5.62, + ), + ( + _create_multi_percentile_forecast_parquet_file, + _create_multi_percentile_truth_parquet_file, + "6:18:6", + True, + False, + "percentile", + 5.62, + ), + ( + _create_multi_forecast_period_forecast_parquet_file, + _create_multi_forecast_period_truth_parquet_file, + "6:18:6", + False, + False, + "percentile", + 5.61, + ), + ( + _create_multi_forecast_period_forecast_parquet_file, + _create_multi_forecast_period_truth_parquet_file, + "6:18:6", + True, + True, # Remove target feature + "percentile", + 5.62, + ), + ( + _create_multi_site_forecast_parquet_file, + _create_multi_site_truth_parquet_file, + "6:18:6", + False, + False, + "realization", # Provide realization input + 5.62, + ), + ( + _create_multi_forecast_period_forecast_parquet_file, + _create_multi_forecast_period_truth_parquet_file, + "12", + False, + False, + "percentile", + 5.64, + ), + ], +) +def test_prepare_and_train_qrf( + tmp_path, + forecast_creation, + truth_creation, + forecast_periods, + include_static, + remove_target, + representation, + expected, +): + """Test the LoadAndTrainQRF plugin.""" + feature_config = {"air_temperature": ["mean", "std", "altitude"]} + n_estimators = 2 + max_depth = 5 + random_state = 46 + target_cf_name = "air_temperature" + + _, forecast_df, wmo_ids = forecast_creation(tmp_path, representation) + forecast_df = amend_expected_forecast_df( + forecast_df, + forecast_periods, + "temperature_at_screen_level", + target_cf_name, + ) + _, truth_df = truth_creation(tmp_path) + truth_df = amend_expected_truth_df(truth_df, "temperature_at_screen_level") + + if include_static: + _, ancil_cube = _create_ancil_file(tmp_path, sorted(list(set(wmo_ids)))) + feature_config["distance_to_water"] = ["static"] + + if remove_target: + feature_config.pop("air_temperature") + + # Create an instance of LoadAndTrainQRF with the required parameters + plugin = PrepareAndTrainQRF( + feature_config=feature_config, + target_cf_name=target_cf_name, + n_estimators=n_estimators, + max_depth=max_depth, + random_state=random_state, + transformation="log", + pre_transform_addition=1, + ) + if include_static: + qrf_model = plugin(forecast_df, truth_df, iris.cube.CubeList([ancil_cube])) + else: + qrf_model = plugin(forecast_df, truth_df) + + assert qrf_model.n_estimators == n_estimators + assert qrf_model.max_depth == max_depth + assert qrf_model.random_state == random_state + + if remove_target: + current_forecast = [] + else: + current_forecast = [5.64, 3, 55] + + if include_static: + current_forecast.append(2.5) + + result = qrf_model.predict( + np.expand_dims(np.array(current_forecast), 0), quantiles=[0.5] + ) + np.testing.assert_almost_equal(result, expected, decimal=2) From c5638abc9bedee8d632e153c7c0d13068dfd1421 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Fri, 12 Sep 2025 17:55:32 +0100 Subject: [PATCH 02/15] Edits to separate loading out of the apply QRF plugin and other tidy-ups. --- ...apply_quantile_regression_random_forest.py | 70 +++++++----------- ...train_quantile_regression_random_forest.py | 2 +- ...apply_quantile_regression_random_forest.py | 12 ++-- ...train_quantile_regression_random_forest.py | 15 ++-- ...apply_quantile_regression_random_forest.py | 72 +++++++------------ ...train_quantile_regression_random_forest.py | 28 ++++---- 6 files changed, 79 insertions(+), 120 deletions(-) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 62f35a0465..2a94e1cfd3 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -6,11 +6,9 @@ """Script to load and apply the trained Quantile Regression Random Forest (QRF) model.""" -import pathlib from typing import Optional import iris -import joblib import numpy as np import pandas as pd from iris.cube import Cube, CubeList @@ -39,7 +37,7 @@ class RandomForestQuantileRegressor: iris.FUTURE.pandas_ndim = True -class LoadAndApplyQRF(PostProcessingPlugin): +class PrepareAndApplyQRF(PostProcessingPlugin): """Load and apply the trained Quantile Regression Random Forest (QRF) model.""" def __init__( @@ -87,40 +85,26 @@ def __init__( self.quantile_forest_installed = quantile_forest_package_available() def _get_inputs( - self, file_paths: list[pathlib.Path] - ) -> tuple[CubeList, Cube, RandomForestQuantileRegressor]: - """Get inputs from disk and separate the model and the features. + self, + cube_inputs: iris.cube.CubeList, + qrf_model: Optional[RandomForestQuantileRegressor] = None, + ) -> tuple[CubeList, Cube]: + """Split the forecast to be calibrated from the other features. Handle + the case where the qrf_model is not provided. In this case, the uncalibrated + forecast is returned with a warning comment added. Args: - file_paths: List of paths to the trained QRF model and the forecast to be - calibrated and the features, as required. + cube_inputs: List of cubes containing the features and the forecast to be + calibrated. Returns: - CubeList of the features cubes, the forecast cube, and the - trained QRF model. + CubeList of the features cubes and the forecast cube. Raises: - ValueError: If no QRF model is found in the provided file paths. - ValueError: If no features are found in the provided file paths. - ValueError: If the number of inputs does not match the number of file paths. + ValueError: If not target forecast is provided. + ValueError: If the number of cubes provided does not match the number of + features expected. """ - cube_inputs = iris.cube.CubeList([]) - qrf_model = None - - for file_path in file_paths: - try: - cube = iris.load_cube(file_path) - cube_inputs.append(cube) - except ValueError: - qrf_model = joblib.load(file_path) - - if not cube_inputs: - msg = ( - "No features found in the provided file paths. " - "At least one feature must be provided." - ) - raise ValueError(msg) - # Extract all additional cubes which are associated with a feature in the # feature_config. forecast_constraint = iris.Constraint(name=self.target_cf_name) @@ -130,15 +114,16 @@ def _get_inputs( (forecast_cube,) = forecast_cube else: msg = ( - "No target forecast provided. An input file representing the target " + "No target forecast provided. An input cube representing the target " "must be provided, even if the target will not be used as a feature. " f"The target is '{self.target_cf_name}'." ) raise ValueError(msg) if not qrf_model: + # If no model is provided, return the input forecast with a warning. forecast_cube = add_warning_comment(forecast_cube) - return None, forecast_cube, None + return None, forecast_cube if len(cube_inputs) != len(self.feature_config.keys()): msg = ( @@ -149,15 +134,11 @@ def _get_inputs( ) raise ValueError(msg) - if not qrf_model: - # The specified model doesn't exist and the forecast will not be calibrated - return forecast_cube - # If target diagnostic not a feature in the training then remove. if self.target_cf_name not in self.feature_config.keys(): cube_inputs.remove(forecast_cube) - return cube_inputs, forecast_cube, qrf_model + return cube_inputs, forecast_cube @staticmethod def _compute_percentiles(forecast_cube: Cube, coord: str) -> list[float]: @@ -239,7 +220,8 @@ def _cube_to_dataframe(cube_inputs: CubeList) -> pd.DataFrame: def process( self, - file_paths: list[pathlib.Path], + cube_inputs: CubeList, + qrf_model: Optional[RandomForestQuantileRegressor], ) -> Cube: """Load and applying the trained Quantile Regression Random Forest (QRF) model. The model is applied to the forecast supplied to calibrate the forecast. @@ -247,18 +229,16 @@ def process( input forecast is returned unchanged. Args: - file_paths (cli.inputpaths): - A list of input paths containing: - - The path to a QRF trained model in pickle file format to be used - for calibration. - - The path to a NetCDF file containing the forecast to be calibrated. - - Optionally, paths to NetCDF files containing additional predictors. + cube_inputs: List of cubes containing the features and the forecast to be + calibrated. + qrf_model: The trained QRF model to be applied to the forecast. Returns: iris.cube.Cube: The calibrated forecast cube. """ - cube_inputs, forecast_cube, qrf_model = self._get_inputs(file_paths) + cube_inputs, forecast_cube = self._get_inputs(cube_inputs, qrf_model=qrf_model) + if not self.quantile_forest_installed: return forecast_cube if not qrf_model: diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 5d06201d7a..933509f020 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -30,7 +30,7 @@ iris.FUTURE.pandas_ndim = True -class LoadForQRF(PostProcessingPlugin): +class LoadForTrainQRF(PostProcessingPlugin): """Plugin to load input files for training a Quantile Regression Random Forest (QRF) model.""" diff --git a/improver/cli/apply_quantile_regression_random_forest.py b/improver/cli/apply_quantile_regression_random_forest.py index c7a6fec8d4..cf8d6a13c1 100644 --- a/improver/cli/apply_quantile_regression_random_forest.py +++ b/improver/cli/apply_quantile_regression_random_forest.py @@ -59,17 +59,17 @@ def process( iris.cube.Cube: The calibrated forecast cube. """ - + from improver.calibration import split_pickle_parquet_and_netcdf from improver.calibration.load_and_apply_quantile_regression_random_forest import ( - LoadAndApplyQRF, + PrepareAndApplyQRF, ) - result = LoadAndApplyQRF( + cubes, _, qrf_model = split_pickle_parquet_and_netcdf(file_paths) + + result = PrepareAndApplyQRF( feature_config=feature_config, target_cf_name=target_cf_name, transformation=transformation, pre_transform_addition=pre_transform_addition, - )( - file_paths, - ) + )(cubes, qrf_model=qrf_model) return result diff --git a/improver/cli/train_quantile_regression_random_forest.py b/improver/cli/train_quantile_regression_random_forest.py index eb87faf018..383af9f1bb 100644 --- a/improver/cli/train_quantile_regression_random_forest.py +++ b/improver/cli/train_quantile_regression_random_forest.py @@ -97,24 +97,27 @@ def process( """ from improver.calibration.load_and_train_quantile_regression_random_forest import ( - LoadAndTrainQRF, + LoadForTrainQRF, + PrepareAndTrainQRF, ) - result = LoadAndTrainQRF( + forecast_df, truth_df, cube_inputs = LoadForTrainQRF( experiment=experiment, feature_config=feature_config, target_diagnostic_name=target_diagnostic_name, target_cf_name=target_cf_name, forecast_periods=forecast_periods, cycletime=cycletime, - training_length=training_length, + )(file_paths) + result = PrepareAndTrainQRF( + feature_config=feature_config, + target_cf_name=target_cf_name, n_estimators=n_estimators, max_depth=max_depth, max_samples=max_samples, random_state=random_state, transformation=transformation, pre_transform_addition=pre_transform_addition, - )( - file_paths, - ) + )(forecast_df, truth_df, cube_inputs) + return result diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py index 9fbeaced5f..dfcbef2175 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py @@ -2,20 +2,19 @@ # # This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. # See LICENSE in the root of the repository for full licensing details. -"""Unit tests for the LoadAndApplyQRF plugin.""" +"""Unit tests for the PrepareAndApplyQRF plugin.""" import numpy as np import pytest from iris.coords import AuxCoord -from iris.cube import Cube +from iris.cube import Cube, CubeList from improver.calibration.load_and_apply_quantile_regression_random_forest import ( - LoadAndApplyQRF, + PrepareAndApplyQRF, ) from improver.ensemble_copula_coupling.ensemble_copula_coupling import ( RebadgeRealizationsAsPercentiles, ) -from improver.utilities.save import save_netcdf from improver_tests.calibration.quantile_regression_random_forests_calibration.test_quantile_regression_random_forest import ( _create_ancil_file, _create_forecasts, @@ -60,8 +59,7 @@ def _add_day_of_training_period_to_cube(cube, day_of_training_period, secondary_ (2, 5, 55, 5, None, 0, {}, True, [0.5], [4.1, 5.65]), # noqa Include an additional static feature ], ) -def test_load_and_apply_qrf( - tmp_path, +def test_prepare_and_apply_qrf( percentile_input, n_estimators, max_depth, @@ -74,10 +72,10 @@ def test_load_and_apply_qrf( quantiles, expected, ): - """Test the LoadAndApplyQRF plugin.""" + """Test the PrepareAndApplyQRF plugin.""" feature_config = {"wind_speed_at_10m": ["mean", "std", "latitude", "longitude"]} - model_output = _run_train_qrf( + qrf_model = _run_train_qrf( feature_config, n_estimators, max_depth, @@ -97,13 +95,13 @@ def test_load_and_apply_qrf( ], realization_data=[2, 6, 10], truth_data=[4.2, 6.2, 4.1, 5.1], - tmp_path=tmp_path, ) frt = "20170103T0000Z" vt = "20170103T1200Z" data = np.arange(6, (len(quantiles) * 6) + 1, 6) day_of_training_period = 2 + cube_inputs = CubeList() forecast_cube = _create_forecasts(frt, vt, data, return_cube=True) forecast_cube = _add_day_of_training_period_to_cube( @@ -113,26 +111,18 @@ def test_load_and_apply_qrf( if percentile_input: forecast_cube = RebadgeRealizationsAsPercentiles()(forecast_cube) - features_dir = tmp_path / "features" - features_dir.mkdir(parents=True) - forecast_filepath = str(features_dir / "forecast.nc") - save_netcdf(forecast_cube, forecast_filepath) - - file_paths = [model_output, forecast_filepath] - + cube_inputs.append(forecast_cube) if include_static: ancil_cube = _create_ancil_file(return_cube=True) - ancil_filepath = features_dir / "ancil.nc" - save_netcdf(ancil_cube, ancil_filepath) - file_paths.append(str(ancil_filepath)) + cube_inputs.append(ancil_cube) - plugin = LoadAndApplyQRF( + # cube_inputs, qrf_model = LoadForApplyQRF()(file_paths) + result = PrepareAndApplyQRF( feature_config, "wind_speed_at_10m", transformation=transformation, pre_transform_addition=pre_transform_addition, - ) - result = plugin.process(file_paths=file_paths) + )(cube_inputs, qrf_model) assert isinstance(result, Cube) assert result.data.shape == (len(quantiles), 2) @@ -165,10 +155,9 @@ def test_load_and_apply_qrf( ], ) def test_unexpected( - tmp_path, exception, ): - """Test LoadAndApplyQRF plugin behaviour in atypical situations.""" + """Test PrepareAndApplyQRF plugin behaviour in atypical situations.""" feature_config = {"wind_speed_at_10m": ["mean", "std", "latitude", "longitude"]} n_estimators = 2 @@ -181,7 +170,7 @@ def test_unexpected( include_static = False quantiles = [0.5] - model_output = _run_train_qrf( + qrf_model = _run_train_qrf( feature_config, n_estimators, max_depth, @@ -201,7 +190,6 @@ def test_unexpected( ], realization_data=[2, 6, 10], truth_data=[4.2, 6.2, 4.1, 5.1], - tmp_path=tmp_path, ) frt = "20170103T0000Z" @@ -212,17 +200,11 @@ def test_unexpected( forecast_cube = _add_day_of_training_period_to_cube( forecast_cube, day_of_training_period, "forecast_reference_time" ) - + cube_inputs = CubeList([forecast_cube]) ancil_cube = _create_ancil_file(return_cube=True) - ancil_filepath = tmp_path / "ancil.nc" - save_netcdf(ancil_cube, ancil_filepath) - - features_dir = tmp_path / "features" - features_dir.mkdir(parents=True) - forecast_filepath = str(features_dir / "forecast.nc") - save_netcdf(forecast_cube, forecast_filepath) + cube_inputs.append(ancil_cube) - plugin = LoadAndApplyQRF( + plugin = PrepareAndApplyQRF( feature_config, "wind_speed_at_10m", transformation=transformation, @@ -230,43 +212,37 @@ def test_unexpected( ) if exception == "no_model_output": - file_paths = [forecast_filepath] - result = plugin.process(file_paths=file_paths) + result = plugin(cube_inputs, qrf_model=None) assert isinstance(result, Cube) assert result.name() == "wind_speed_at_10m" assert result.units == "m s-1" assert result.data.shape == forecast_cube.data.shape assert np.allclose(result.data, forecast_cube.data) elif exception == "no_features": - file_paths = [model_output] - with pytest.raises(ValueError, match="No features found"): - plugin.process(file_paths=file_paths) + with pytest.raises(ValueError, match="No target forecast provided."): + plugin(CubeList(), qrf_model=qrf_model) elif exception == "missing_target_feature": - file_paths = [model_output, ancil_filepath] with pytest.raises(ValueError, match="No target forecast provided."): - plugin.process(file_paths=file_paths) + plugin(CubeList([ancil_cube]), qrf_model=qrf_model) elif exception == "missing_static_feature": feature_config = { "wind_speed_at_10m": ["mean", "std"], "distance_to_water": ["static"], } plugin.feature_config = feature_config - file_paths = [model_output, forecast_filepath] with pytest.raises(ValueError, match="The number of cubes loaded."): - plugin.process(file_paths=file_paths) + plugin(CubeList([forecast_cube]), qrf_model=qrf_model) elif exception == "missing_dynamic_feature": feature_config = { "wind_speed_at_10m": ["mean", "std"], "air_temperature": ["mean", "std"], } plugin.feature_config = feature_config - file_paths = [model_output, forecast_filepath] with pytest.raises(ValueError, match="The number of cubes loaded."): - plugin.process(file_paths=file_paths) + plugin(CubeList([forecast_cube]), qrf_model=qrf_model) elif exception == "no_quantile_forest_package": plugin.quantile_forest_installed = False - file_paths = [model_output, forecast_filepath] - result = plugin.process(file_paths=file_paths) + result = plugin(CubeList([forecast_cube]), qrf_model=qrf_model) assert isinstance(result, Cube) assert result.name() == "wind_speed_at_10m" assert result.units == "m s-1" diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index 3108c265fb..463bf76d3f 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -2,7 +2,7 @@ # # This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. # See LICENSE in the root of the repository for full licensing details. -"""Unit tests for the LoadForQRF and PrepareAndTrain QRF plugins.""" +"""Unit tests for the LoadForTrainQRF and PrepareAndTrain QRF plugins.""" import iris import numpy as np @@ -10,7 +10,7 @@ import pytest from improver.calibration.load_and_train_quantile_regression_random_forest import ( - LoadForQRF, + LoadForTrainQRF, PrepareAndTrainQRF, ) from improver.synthetic_data.set_up_test_cubes import set_up_spot_variable_cube @@ -401,7 +401,7 @@ def test_load_for_qrf( remove_target, representation, ): - """Test the LoadAndTrainQRF plugin.""" + """Test the LoadForTrainQRF plugin.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} forecast_path, expected_forecast_df, wmo_ids = forecast_creation( @@ -431,8 +431,8 @@ def test_load_for_qrf( if remove_target: feature_config.pop("air_temperature") - # Create an instance of LoadAndTrainQRF with the required parameters - plugin = LoadForQRF( + # Create an instance of LoadForTrainQRF with the required parameters + plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -464,7 +464,7 @@ def test_load_for_qrf( @pytest.mark.parametrize("make_files", [(False, True)]) def test_load_for_qrf_no_paths(tmp_path, make_files): - """Test the LoadAndTrainQRF plugin when the no valid file paths are provided. + """Test the LoadForTrainQRF plugin when the no valid file paths are provided. Either the paths do not exist, or the paths exist but the directories are empty.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} @@ -476,7 +476,7 @@ def test_load_for_qrf_no_paths(tmp_path, make_files): for file_path in file_paths: (tmp_path / file_path).mkdir(parents=True, exist_ok=True) - plugin = LoadForQRF( + plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -514,7 +514,7 @@ def test_load_for_qrf_mismatches( cycletime, forecast_periods, ): - """Test the LoadAndTrainQRF plugin when the cycletime or forecast_periods + """Test the LoadForTrainQRF plugin when the cycletime or forecast_periods requested are not present in the provided files.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} representation = "percentile" @@ -534,7 +534,7 @@ def test_load_for_qrf_mismatches( file_paths = [forecast_path, truth_path] - plugin = LoadForQRF( + plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -604,15 +604,15 @@ def test_unexpected( forecast_periods, representation, ): - """Test LoadAndTrainQRF plugin behaviour in atypical situations.""" + """Test LoadForTrainQRF plugin behaviour in atypical situations.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} forecast_path, _, _ = forecast_creation(tmp_path, representation) truth_path, _ = truth_creation(tmp_path) file_paths = [forecast_path, truth_path] - # Create an instance of LoadAndTrainQRF with the required parameters - plugin = LoadForQRF( + # Create an instance of LoadForTrainQRF with the required parameters + plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, target_diagnostic_name="temperature_at_screen_level", @@ -742,7 +742,7 @@ def test_prepare_and_train_qrf( representation, expected, ): - """Test the LoadAndTrainQRF plugin.""" + """Test the PrepareAndTrainQRF plugin.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} n_estimators = 2 max_depth = 5 @@ -766,7 +766,7 @@ def test_prepare_and_train_qrf( if remove_target: feature_config.pop("air_temperature") - # Create an instance of LoadAndTrainQRF with the required parameters + # Create an instance of PrepareAndTrainQRF with the required parameters plugin = PrepareAndTrainQRF( feature_config=feature_config, target_cf_name=target_cf_name, From 763a5e2d951c428a5896b2ee24d84df5b0612aae Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Fri, 12 Sep 2025 18:45:27 +0100 Subject: [PATCH 03/15] Further edits following review comments. --- ...apply_quantile_regression_random_forest.py | 1 - ...train_quantile_regression_random_forest.py | 38 +++++++++++++++++-- ...apply_quantile_regression_random_forest.py | 4 +- ...train_quantile_regression_random_forest.py | 24 ++++++++---- improver/utilities/compare.py | 16 +++++++- 5 files changed, 68 insertions(+), 15 deletions(-) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 2a94e1cfd3..549fa00ca5 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -77,7 +77,6 @@ def __init__( Value to be added before transformation. """ - self.feature_config = feature_config self.target_cf_name = target_cf_name self.transformation = transformation diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 933509f020..31704a846b 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -44,7 +44,25 @@ def __init__( training_length: int, experiment: Optional[str] = None, ): - """Initialise the LoadForQRF plugin.""" + """Initialise the LoadForQRF plugin. + + Args: + feature_config: Feature configuration defining the features to be used for + Quantile Regression Random Forests. + target_diagnostic_name: A string containing the diagnostic name of the + forecast to be calibrated. This will be used to filter the target + forecast and truth dataframes. This could be different from the CF name + e.g. 'temperature_at_screen_level'. + target_cf_name: A string containing the CF name of the forecast to be + calibrated e.g. air_temperature. + forecast_periods: Range of forecast periods to be calibrated in hours in + the form: "start:end:interval" e.g. "6:18:6" or a single forecast period + e.g. "6". + cycletime: The time at which the forecast is valid in the form: + YYYYMMDDTHHMMZ. + training_length: The number of days of training data to use. + experiment: The name of the experiment (step) that calibration is applied to. + """ self.quantile_forest_installed = quantile_forest_package_available() self.feature_config = feature_config self.target_diagnostic_name = target_diagnostic_name @@ -250,7 +268,21 @@ def __init__( transformation: Optional[str] = None, pre_transform_addition: float = 0, ): - """Initialise the PrepareAndTrainQRF plugin.""" + """Initialise the PrepareAndTrainQRF plugin. + + Args: + feature_config: Feature configuration defining the features to be used for + Quantile Regression Random Forests. + target_cf_name: A string containing the CF name of the forecast to be + calibrated e.g. air_temperature. + n_estimators: The number of trees in the forest. + max_depth: The maximum depth of the trees. + max_samples: The maximum number of samples to draw from the total number of + samples to train each tree. + random_state: Seed used by the random number generator. + transformation: Transformation to be applied to the data before fitting. + pre_transform_addition: Value to be added before transformation. + """ self.feature_config = feature_config self.target_cf_name = target_cf_name self.n_estimators = n_estimators @@ -338,7 +370,7 @@ def process( truth_df: pd.DataFrame, cube_inputs: Optional[iris.cube.CubeList] = None, ) -> None: - """Load input files and training a Quantile Regression Random Forest (QRF) + """Load input files and train a Quantile Regression Random Forest (QRF) model. This model can be applied later to calibrate the forecast. Two sources of input data must be provided: historical forecasts and historical truth data (to use in calibration). The model is output as a pickle file. diff --git a/improver/cli/apply_quantile_regression_random_forest.py b/improver/cli/apply_quantile_regression_random_forest.py index cf8d6a13c1..a18c15f360 100644 --- a/improver/cli/apply_quantile_regression_random_forest.py +++ b/improver/cli/apply_quantile_regression_random_forest.py @@ -49,8 +49,8 @@ def process( } target_cf_name (str): A string containing the CF name of the forecast to be - calibrated. This will be used to separate it from the rest of the - feature cubes, if present. + calibrated e.g. air_temperature. This will be used to separate it from + the rest of the feature cubes, if present. transformation (str): Transformation to be applied to the data before fitting. pre_transform_addition (float): diff --git a/improver/cli/train_quantile_regression_random_forest.py b/improver/cli/train_quantile_regression_random_forest.py index 383af9f1bb..609cc043a8 100644 --- a/improver/cli/train_quantile_regression_random_forest.py +++ b/improver/cli/train_quantile_regression_random_forest.py @@ -62,9 +62,11 @@ def process( target_diagnostic_name (str): A string containing the diagnostic name of the forecast to be calibrated. This will be used to filter the target forecast and truth - dataframes. + dataframes. This could be different from the CF name e.g. + 'temperature_at_screen_level'. target_cf_name (str): - A string containing the CF name of the forecast to be calibrated. + A string containing the CF name of the forecast to be calibrated + e.g. air_temperature. forecast_periods (str): Range of forecast periods to be calibrated in hours in the form: "start:end:interval" e.g. "6:18:6" or a single forecast period e.g. "6". @@ -81,10 +83,16 @@ def process( max_depth (int): Maximum depth of the tree. max_samples (float): - If an int, then it is the number of samples to draw to train - each tree. If a float, then it is the fraction of samples to draw - to train each tree. If None, then each tree contains the same - total number of samples as originally provided. + If an int, then it is the number of samples to draw from the total number + of samples available to train each tree. Note that a 'sample' refers to + each row within the DataFrames constructed where each row will differ + primarily based on the site, forecast period, forecast reference time and + realization or percentile. If a float, then it is the fraction of samples + to draw from the total number of samples available to train each tree. + If None, then each tree contains the same number of samples as the total + available. The trees will therefore only differ due to the use of + bootstrapping (i.e. sampling with replacement) when creating the + each tree. random_state (int): Random seed for reproducibility. transformation (str): @@ -92,8 +100,7 @@ def process( pre_transform_addition (float): Value to be added before transformation. Returns: - None: - The function creates a pickle file. + A quantile regression random forest model. """ from improver.calibration.load_and_train_quantile_regression_random_forest import ( @@ -108,6 +115,7 @@ def process( target_cf_name=target_cf_name, forecast_periods=forecast_periods, cycletime=cycletime, + training_length=training_length, )(file_paths) result = PrepareAndTrainQRF( feature_config=feature_config, diff --git a/improver/utilities/compare.py b/improver/utilities/compare.py index 598349c85e..819472c427 100644 --- a/improver/utilities/compare.py +++ b/improver/utilities/compare.py @@ -422,4 +422,18 @@ def raise_reporter(message): # call the reporter function outside the except block to avoid nested # exceptions if the reporter function is raising an exception if difference_found: - reporter("different pickled forest") + msg = ( + "Different pickled forest. \nThe output is: " + "n_features: {output.n_features_in_}, " + "n_outputs: {output.n_outputs_}, " + "max_depth: {output.max_depth}, " + "n_estimators: {output.n_estimators}, " + "random_state: {output.random_state}." + "\nThe KGO is: " + "n_features: {kgo.n_features_in_}, " + "n_outputs: {kgo.n_outputs_}, " + "max_depth: {kgo.max_depth}, " + "n_estimators: {kgo.n_estimators}, " + "random_state: {kgo.random_state}" + ) + reporter(msg) From 09b4e223ec692d3d930aea0374fd3c72081b1b66 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Tue, 16 Sep 2025 11:30:45 +0100 Subject: [PATCH 04/15] Commit edits to support using dynamic features for training. --- improver/calibration/__init__.py | 23 ++ ...apply_quantile_regression_random_forest.py | 26 +- ...train_quantile_regression_random_forest.py | 92 ++++-- .../quantile_regression_random_forest.py | 8 +- .../estimate_emos_coefficients_from_table.py | 10 +- ...train_quantile_regression_random_forest.py | 12 +- ...apply_quantile_regression_random_forest.py | 26 +- ...train_quantile_regression_random_forest.py | 279 +++++++++--------- improver_tests/calibration/test_init.py | 27 ++ 9 files changed, 300 insertions(+), 203 deletions(-) diff --git a/improver/calibration/__init__.py b/improver/calibration/__init__.py index 5b5dc4bbf3..b498168829 100644 --- a/improver/calibration/__init__.py +++ b/improver/calibration/__init__.py @@ -12,6 +12,7 @@ import iris import joblib +import pandas as pd import pyarrow.parquet as pq from iris.cube import Cube, CubeList @@ -387,3 +388,25 @@ def add_warning_comment(forecast: Cube) -> Cube: "however, no calibration has been applied." ) return forecast + + +def get_training_period_cycles( + cycletime: str, forecast_period: Union[int, str], training_length: int +): + """Generate a list of forecast reference times for the training period. + + Args: + cycletime: The time at which the forecast is issued in a format understood by + pandas.Timestamp e.g. 20170109T0000Z. + forecast_period: The forecast period in seconds. + training_length: The number of days in the training period. + """ + forecast_period_td = pd.Timedelta(int(forecast_period), unit="seconds") + + return pd.date_range( + end=pd.Timestamp(cycletime) + - pd.Timedelta(1, unit="days") + - forecast_period_td.floor("D"), + periods=int(training_length), + freq="D", + ) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 549fa00ca5..8a3c8f0371 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -44,8 +44,6 @@ def __init__( self, feature_config: dict[str, list[str]], target_cf_name: str, - transformation: Optional[str] = None, - pre_transform_addition: Optional[float] = None, ): """Initialise the plugin. @@ -71,16 +69,9 @@ def __init__( A string containing the CF name of diagnostic to be calibrated. This will be used to separate it from the rest of the dynamic predictors, if present. - transformation (str): - Transformation to be applied to the data before fitting. - pre_transform_addition (float): - Value to be added before transformation. - """ self.feature_config = feature_config self.target_cf_name = target_cf_name - self.transformation = transformation - self.pre_transform_addition = pre_transform_addition self.quantile_forest_installed = quantile_forest_package_available() def _get_inputs( @@ -220,7 +211,9 @@ def _cube_to_dataframe(cube_inputs: CubeList) -> pd.DataFrame: def process( self, cube_inputs: CubeList, - qrf_model: Optional[RandomForestQuantileRegressor], + qrf_descriptors: Optional[ + tuple[RandomForestQuantileRegressor, str, float] + ] = None, ) -> Cube: """Load and applying the trained Quantile Regression Random Forest (QRF) model. The model is applied to the forecast supplied to calibrate the forecast. @@ -230,12 +223,19 @@ def process( Args: cube_inputs: List of cubes containing the features and the forecast to be calibrated. - qrf_model: The trained QRF model to be applied to the forecast. + qrf_descriptors: The trained QRF model to be applied to the forecast + and the transformation and pre-transform addition applied during + training. Returns: iris.cube.Cube: The calibrated forecast cube. """ + if qrf_descriptors is None: + # If no descriptors are provided, return the input forecast with a warning. + # Descriptors expected: (qrf_model, transformation, pre_transform_addition) + qrf_descriptors = (None, None, 0) + qrf_model, transformation, pre_transform_addition = qrf_descriptors cube_inputs, forecast_cube = self._get_inputs(cube_inputs, qrf_model=qrf_model) if not self.quantile_forest_installed: @@ -256,8 +256,8 @@ def process( target_name=self.target_cf_name, feature_config=self.feature_config, quantiles=percentiles, - transformation=self.transformation, - pre_transform_addition=self.pre_transform_addition, + transformation=transformation, + pre_transform_addition=pre_transform_addition, )(qrf_model, df) calibrated_forecast_cube = template_forecast_cube.copy( data=np.broadcast_to(calibrated_forecast.T, template_forecast_cube.shape) diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 31704a846b..ba9a9e3d4f 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -19,6 +19,7 @@ from improver import PostProcessingPlugin from improver.calibration import ( CalibrationSchemas, + get_training_period_cycles, identify_parquet_type, split_pickle_parquet_and_netcdf, ) @@ -27,6 +28,14 @@ quantile_forest_package_available, ) +try: + from quantile_forest import RandomForestQuantileRegressor +except ModuleNotFoundError: + # Define empty class to avoid type hint errors. + class RandomForestQuantileRegressor: + pass + + iris.FUTURE.pandas_ndim = True @@ -37,7 +46,7 @@ class LoadForTrainQRF(PostProcessingPlugin): def __init__( self, feature_config: dict[str, list[str]], - target_diagnostic_name: str, + parquet_diagnostic_names: str, target_cf_name: str, forecast_periods: str, cycletime: str, @@ -49,10 +58,11 @@ def __init__( Args: feature_config: Feature configuration defining the features to be used for Quantile Regression Random Forests. - target_diagnostic_name: A string containing the diagnostic name of the - forecast to be calibrated. This will be used to filter the target - forecast and truth dataframes. This could be different from the CF name - e.g. 'temperature_at_screen_level'. + parquet_diagnostic_names (str): + A string containing the diagnostic name that will be used for filtering + the target diagnostic from the forecast and truth DataFrames read in + from the parquet files. This could be different from the CF name e.g. + 'temperature_at_screen_level'. target_cf_name: A string containing the CF name of the forecast to be calibrated e.g. air_temperature. forecast_periods: Range of forecast periods to be calibrated in hours in @@ -65,7 +75,7 @@ def __init__( """ self.quantile_forest_installed = quantile_forest_package_available() self.feature_config = feature_config - self.target_diagnostic_name = target_diagnostic_name + self.parquet_diagnostic_names = parquet_diagnostic_names self.target_cf_name = target_cf_name self.forecast_periods = forecast_periods self.cycletime = cycletime @@ -124,22 +134,16 @@ def _read_parquet_files( cycletimes = [] for forecast_period in forecast_periods: # Load forecasts from parquet file filtering by diagnostic and blend_time. - forecast_period_td = pd.Timedelta(int(forecast_period), unit="seconds") - cycletimes.extend( - pd.date_range( - end=pd.Timestamp(self.cycletime) - - pd.Timedelta(1, unit="days") - - forecast_period_td.floor("D"), - periods=int(self.training_length), - freq="D", + get_training_period_cycles( + self.cycletime, forecast_period, self.training_length ) ) cycletimes = list(set(cycletimes)) filters = [ [ - ("diagnostic", "==", self.target_diagnostic_name), + ("diagnostic", "in", self.parquet_diagnostic_names), ("blend_time", "in", cycletimes), ("experiment", "==", self.experiment), ] @@ -181,10 +185,41 @@ def _read_parquet_files( forecast_df[column] = pd.to_timedelta(forecast_df[column], unit="ns") forecast_df[column] = forecast_df[column].astype("timedelta64[ns]") - forecast_df = forecast_df.rename(columns={"forecast": self.target_cf_name}) + # Convert additional features from rows to columns. + representation = ( + "percentile" if "percentile" in forecast_df.columns else "realization" + ) + base_df = forecast_df[ + forecast_df["diagnostic"] == self.parquet_diagnostic_names[0] + ] + for parquet_diagnostic_name in self.parquet_diagnostic_names[1:]: + additional_df = forecast_df[ + forecast_df["diagnostic"] == parquet_diagnostic_name + ] + base_df = pd.merge( + base_df, + additional_df[ + [ + "wmo_id", + "forecast_reference_time", + "forecast_period", + representation, + "forecast", + ] + ].rename(columns={"forecast": parquet_diagnostic_name}), + on=[ + "wmo_id", + "forecast_reference_time", + "forecast_period", + representation, + ], + how="left", + ) + forecast_df = base_df + forecast_df.rename(columns={"forecast": self.target_cf_name}, inplace=True) # Load truths from parquet file filtering by diagnostic. - filters = [[("diagnostic", "==", self.target_diagnostic_name)]] + filters = [[("diagnostic", "==", self.parquet_diagnostic_names[0])]] truth_df = pd.read_parquet( truth_table_path, filters=filters, @@ -314,10 +349,12 @@ def _check_matching_times( ) ) - def _add_features_to_df( + def _add_static_features_from_cube_to_df( self, forecast_df: pd.DataFrame, cube_inputs: iris.cube.CubeList ) -> pd.DataFrame: - """Add features to the forecast DataFrame based on the feature configuration. + """Add features to the forecast DataFrame from cubes based on the feature + configuration. Other features are expected to already be present in the + forecast DataFrame. Args: forecast_df: DataFrame containing the forecast data. @@ -326,11 +363,17 @@ def _add_features_to_df( Returns: DataFrame with additional features added. """ + if cube_inputs is None or len(cube_inputs) == 0: + return forecast_df for feature_name, feature_list in self.feature_config.items(): for feature in feature_list: if feature == "static": # Use the cube's data directly as a feature. constr = iris.Constraint(name=feature_name) + # Static features can be provided either as a cube or as a column + # in the forecast DataFrame. + if not cube_inputs.extract(constr): + continue feature_cube = cube_inputs.extract_cube(constr) feature_df = as_data_frame(feature_cube, add_aux_coords=True) forecast_df = forecast_df.merge( @@ -369,7 +412,7 @@ def process( forecast_df: pd.DataFrame, truth_df: pd.DataFrame, cube_inputs: Optional[iris.cube.CubeList] = None, - ) -> None: + ) -> Optional[tuple[RandomForestQuantileRegressor, str, float]]: """Load input files and train a Quantile Regression Random Forest (QRF) model. This model can be applied later to calibrate the forecast. Two sources of input data must be provided: historical forecasts and historical truth data @@ -391,7 +434,9 @@ def process( if len(intersecting_times) == 0: return None - forecast_df = self._add_features_to_df(forecast_df, cube_inputs) + forecast_df = self._add_static_features_from_cube_to_df( + forecast_df, cube_inputs + ) forecast_df, truth_df = self.filter_bad_sites(forecast_df, truth_df) result = TrainQuantileRegressionRandomForests( @@ -404,4 +449,7 @@ def process( transformation=self.transformation, pre_transform_addition=self.pre_transform_addition, )(forecast_df, truth_df) - return result + + # Create a tuple that returns the model, transformation and + # pre_transform_addition to allow these to be saved together. + return (result, self.transformation, self.pre_transform_addition) diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index 48c10c041a..5768eccb12 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -287,7 +287,7 @@ def process( self, forecast_df: pd.DataFrame, truth_df: pd.DataFrame, - ) -> None: + ) -> RandomForestQuantileRegressor: """Train a quantile regression random forests model. Args: @@ -356,7 +356,7 @@ def __init__( feature_config: dict[str, list[str]], quantiles: list[np.float32], transformation: str = None, - pre_transform_addition: np.float32 = 0, + pre_transform_addition: np.float32 = 0, ) -> None: """Initialise the plugin. @@ -384,7 +384,7 @@ def __init__( transformation (str): Transformation to be applied to the data before fitting. pre_transform_addition (float): - Value to be added before transformation. + Value to be added before transformation. Raises: ValueError: If the transformation is not one of the supported types. @@ -395,7 +395,7 @@ def __init__( self.quantiles = quantiles self.transformation = transformation _check_valid_transformation(self.transformation) - self.pre_transform_addition = pre_transform_addition + self.pre_transform_addition = pre_transform_addition def _reverse_transformation(self, forecast: np.ndarray) -> np.ndarray: """Reverse the transformation applied to the data prior to fitting the QRF. diff --git a/improver/cli/estimate_emos_coefficients_from_table.py b/improver/cli/estimate_emos_coefficients_from_table.py index 8e7342f795..c78ff9aa9f 100755 --- a/improver/cli/estimate_emos_coefficients_from_table.py +++ b/improver/cli/estimate_emos_coefficients_from_table.py @@ -120,6 +120,7 @@ def process( import pandas as pd from iris.cube import CubeList + from improver.calibration import get_training_period_cycles from improver.calibration.dataframe_utilities import ( forecast_and_truth_dataframes_to_cubes, ) @@ -128,14 +129,7 @@ def process( ) # Load forecasts from parquet file filtering by diagnostic and blend_time. - forecast_period_td = pd.Timedelta(int(forecast_period), unit="seconds") - cycletimes = pd.date_range( - end=pd.Timestamp(cycletime) - - pd.Timedelta(1, unit="days") - - forecast_period_td.floor("D"), - periods=int(training_length), - freq="D", - ) + cycletimes = get_training_period_cycles(cycletime, forecast_period, training_length) filters = [[("diagnostic", "==", diagnostic), ("blend_time", "in", cycletimes)]] forecast_df = pd.read_parquet(forecast, filters=filters) diff --git a/improver/cli/train_quantile_regression_random_forest.py b/improver/cli/train_quantile_regression_random_forest.py index 609cc043a8..87c28127a7 100644 --- a/improver/cli/train_quantile_regression_random_forest.py +++ b/improver/cli/train_quantile_regression_random_forest.py @@ -13,7 +13,7 @@ def process( *file_paths: cli.inputpath, feature_config: cli.inputjson, - target_diagnostic_name: str, + parquet_diagnostic_names: cli.comma_separated_list, target_cf_name: str, forecast_periods: str, cycletime: str, @@ -59,10 +59,10 @@ def process( "visibility_at_screen_level": ["mean", "std"] "distance_to_water": ["static"], } - target_diagnostic_name (str): - A string containing the diagnostic name of the forecast to be - calibrated. This will be used to filter the target forecast and truth - dataframes. This could be different from the CF name e.g. + parquet_diagnostic_names (str): + A string containing the diagnostic name that will be used for filtering + the target diagnostic from the forecast and truth DataFrames read in + from the parquet files. This could be different from the CF name e.g. 'temperature_at_screen_level'. target_cf_name (str): A string containing the CF name of the forecast to be calibrated @@ -111,7 +111,7 @@ def process( forecast_df, truth_df, cube_inputs = LoadForTrainQRF( experiment=experiment, feature_config=feature_config, - target_diagnostic_name=target_diagnostic_name, + parquet_diagnostic_names=parquet_diagnostic_names, target_cf_name=target_cf_name, forecast_periods=forecast_periods, cycletime=cycletime, diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py index dfcbef2175..80948da017 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py @@ -120,9 +120,7 @@ def test_prepare_and_apply_qrf( result = PrepareAndApplyQRF( feature_config, "wind_speed_at_10m", - transformation=transformation, - pre_transform_addition=pre_transform_addition, - )(cube_inputs, qrf_model) + )(cube_inputs, (qrf_model, transformation, pre_transform_addition)) assert isinstance(result, Cube) assert result.data.shape == (len(quantiles), 2) @@ -207,42 +205,48 @@ def test_unexpected( plugin = PrepareAndApplyQRF( feature_config, "wind_speed_at_10m", - transformation=transformation, - pre_transform_addition=pre_transform_addition, ) if exception == "no_model_output": - result = plugin(cube_inputs, qrf_model=None) + result = plugin(cube_inputs, qrf_descriptors=None) assert isinstance(result, Cube) assert result.name() == "wind_speed_at_10m" assert result.units == "m s-1" assert result.data.shape == forecast_cube.data.shape assert np.allclose(result.data, forecast_cube.data) elif exception == "no_features": + qrf_descriptors = (qrf_model, transformation, pre_transform_addition) with pytest.raises(ValueError, match="No target forecast provided."): - plugin(CubeList(), qrf_model=qrf_model) + plugin(CubeList(), qrf_descriptors=qrf_descriptors) elif exception == "missing_target_feature": + qrf_descriptors = (qrf_model, transformation, pre_transform_addition) with pytest.raises(ValueError, match="No target forecast provided."): - plugin(CubeList([ancil_cube]), qrf_model=qrf_model) + plugin( + CubeList([ancil_cube]), + qrf_descriptors=qrf_descriptors, + ) elif exception == "missing_static_feature": + qrf_descriptors = (qrf_model, transformation, pre_transform_addition) feature_config = { "wind_speed_at_10m": ["mean", "std"], "distance_to_water": ["static"], } plugin.feature_config = feature_config with pytest.raises(ValueError, match="The number of cubes loaded."): - plugin(CubeList([forecast_cube]), qrf_model=qrf_model) + plugin(CubeList([forecast_cube]), qrf_descriptors=qrf_descriptors) elif exception == "missing_dynamic_feature": + qrf_descriptors = (qrf_model, transformation, pre_transform_addition) feature_config = { "wind_speed_at_10m": ["mean", "std"], "air_temperature": ["mean", "std"], } plugin.feature_config = feature_config with pytest.raises(ValueError, match="The number of cubes loaded."): - plugin(CubeList([forecast_cube]), qrf_model=qrf_model) + plugin(CubeList([forecast_cube]), qrf_descriptors=qrf_descriptors) elif exception == "no_quantile_forest_package": + qrf_descriptors = (qrf_model, transformation, pre_transform_addition) plugin.quantile_forest_installed = False - result = plugin(CubeList([forecast_cube]), qrf_model=qrf_model) + result = plugin(CubeList([forecast_cube]), qrf_descriptors=qrf_descriptors) assert isinstance(result, Cube) assert result.name() == "wind_speed_at_10m" assert result.units == "m s-1" diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index 463bf76d3f..1fc33bd1b3 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -65,9 +65,15 @@ def _create_multi_site_forecast_parquet_file(tmp_path, representation="percentil wind_speed_dict["cf_name"] = "wind_speed" wind_speed_dict["units"] = "m s-1" wind_speed_dict["diagnostic"] = "wind_speed_at_10m" + wind_dir_dict = data_dict.copy() + wind_dir_dict["forecast"] = [90, 100, 110, 120, 130] + wind_dir_dict["cf_name"] = "wind_direction" + wind_dir_dict["units"] = "degrees" + wind_dir_dict["diagnostic"] = "wind_from_direction" data_df = pd.DataFrame(data_dict) wind_speed_df = pd.DataFrame(wind_speed_dict) - joined_df = pd.concat([data_df, wind_speed_df], ignore_index=True) + wind_dir_df = pd.DataFrame(wind_dir_dict) + joined_df = pd.concat([data_df, wind_speed_df, wind_dir_df], ignore_index=True) output_dir = tmp_path / "forecast_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) @@ -99,15 +105,27 @@ def _create_multi_percentile_forecast_parquet_file(tmp_path, representation=None "height": [1.5] * 5, "diagnostic": ["temperature_at_screen_level"] * 5, } + if representation == "realization": + data_dict["realization"] = list(range(len(data_dict["percentile"]))) + data_dict.pop("percentile") + elif representation == "kittens": + data_dict["kittens"] = list(range(len(data_dict["percentile"]))) + data_dict.pop("percentile") # Add wind speed to demonstrate filtering. wind_speed_dict = data_dict.copy() wind_speed_dict["forecast"] = [6, 10, 11, 12, 15] wind_speed_dict["cf_name"] = "wind_speed" wind_speed_dict["units"] = "m s-1" wind_speed_dict["diagnostic"] = "wind_speed_at_10m" + wind_dir_dict = data_dict.copy() + wind_dir_dict["forecast"] = [90, 100, 110, 120, 130] + wind_dir_dict["cf_name"] = "wind_direction" + wind_dir_dict["units"] = "degrees" + wind_dir_dict["diagnostic"] = "wind_from_direction" data_df = pd.DataFrame(data_dict) wind_speed_df = pd.DataFrame(wind_speed_dict) - joined_df = pd.concat([data_df, wind_speed_df], ignore_index=True) + wind_dir_df = pd.DataFrame(wind_dir_dict) + joined_df = pd.concat([data_df, wind_speed_df, wind_dir_df], ignore_index=True) output_dir = tmp_path / "forecast_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) @@ -147,15 +165,27 @@ def _create_multi_forecast_period_forecast_parquet_file(tmp_path, representation "height": [1.5] * 4, "diagnostic": ["temperature_at_screen_level"] * 4, } + if representation == "realization": + data_dict["realization"] = list(range(len(data_dict["percentile"]))) + data_dict.pop("percentile") + elif representation == "kittens": + data_dict["kittens"] = list(range(len(data_dict["percentile"]))) + data_dict.pop("percentile") # Add wind speed to demonstrate filtering. wind_speed_dict = data_dict.copy() wind_speed_dict["forecast"] = [6, 16, 12, 15] wind_speed_dict["cf_name"] = "wind_speed" wind_speed_dict["units"] = "m s-1" wind_speed_dict["diagnostic"] = "wind_speed_at_10m" + wind_dir_dict = data_dict.copy() + wind_dir_dict["forecast"] = [180, 190, 200, 210] + wind_dir_dict["cf_name"] = "wind_from_direction" + wind_dir_dict["units"] = "degrees" + wind_dir_dict["diagnostic"] = "wind_direction" data_df = pd.DataFrame(data_dict) wind_speed_df = pd.DataFrame(wind_speed_dict) - joined_df = pd.concat([data_df, wind_speed_df], ignore_index=True) + wind_dir_df = pd.DataFrame(wind_dir_dict) + joined_df = pd.concat([data_df, wind_speed_df, wind_dir_df], ignore_index=True) output_dir = tmp_path / "forecast_parquet_files" output_dir.mkdir(parents=True, exist_ok=True) @@ -304,123 +334,101 @@ def filter_forecast_periods(forecast_df, forecast_periods): def amend_expected_forecast_df( - forecast_df, forecast_periods, target_diagnostic_name, target_cf_name + forecast_df, forecast_periods, parquet_diagnostic_names, representation ): forecast_df = filter_forecast_periods(forecast_df, forecast_periods) - forecast_df = forecast_df[forecast_df["diagnostic"] == target_diagnostic_name] for column in ["time", "forecast_reference_time", "blend_time"]: forecast_df[column] = pd.to_datetime(forecast_df[column], unit="ns", utc=True) for column in ["forecast_period", "period"]: forecast_df[column] = pd.to_timedelta(forecast_df[column], unit="ns") - forecast_df = forecast_df.rename(columns={"forecast": target_cf_name}) + base_df = forecast_df[forecast_df["diagnostic"] == parquet_diagnostic_names[0]] + for parquet_diagnostic_name in parquet_diagnostic_names[1:]: + additional_df = forecast_df[ + forecast_df["diagnostic"] == parquet_diagnostic_name + ] + base_df = pd.merge( + base_df, + additional_df[ + [ + "wmo_id", + "forecast_reference_time", + "forecast_period", + representation, + "forecast", + ] + ].rename(columns={"forecast": parquet_diagnostic_name}), + on=["wmo_id", "forecast_reference_time", "forecast_period", representation], + how="left", + ) + forecast_df = base_df + + forecast_df.rename(columns={"forecast": "air_temperature"}, inplace=True) return forecast_df -def amend_expected_truth_df(truth_df, target_diagnostic_name): - truth_df = truth_df[truth_df["diagnostic"] == target_diagnostic_name] +def amend_expected_truth_df(truth_df, parquet_diagnostic_name): + truth_df = truth_df[truth_df["diagnostic"] == parquet_diagnostic_name] truth_df["time"] = pd.to_datetime(truth_df["time"], unit="ns", utc=True) return truth_df +@pytest.mark.parametrize("representation", ["percentile", "realization"]) +@pytest.mark.parametrize("include_dynamic", [True, False]) +@pytest.mark.parametrize("include_static", [True, False]) +@pytest.mark.parametrize("include_noncube_static", [True, False]) +@pytest.mark.parametrize("remove_target", [True, False]) @pytest.mark.parametrize( - "forecast_creation,truth_creation,forecast_periods,include_static,remove_target,representation", + "forecast_creation,truth_creation,forecast_periods", [ ( _create_multi_site_forecast_parquet_file, _create_multi_site_truth_parquet_file, "6:18:6", - False, - False, - "percentile", - ), - ( - _create_multi_site_forecast_parquet_file, - _create_multi_site_truth_parquet_file, - "6:18:6", - True, - False, - "percentile", ), ( _create_multi_percentile_forecast_parquet_file, _create_multi_percentile_truth_parquet_file, "6:18:6", - False, - False, - "percentile", - ), - ( - _create_multi_percentile_forecast_parquet_file, - _create_multi_percentile_truth_parquet_file, - "6:18:6", - True, - False, - "percentile", - ), - ( - _create_multi_forecast_period_forecast_parquet_file, - _create_multi_forecast_period_truth_parquet_file, - "6:18:6", - False, - False, - "percentile", ), ( _create_multi_forecast_period_forecast_parquet_file, _create_multi_forecast_period_truth_parquet_file, "6:18:6", - True, - True, # Remove target feature - "percentile", - ), - ( - _create_multi_site_forecast_parquet_file, - _create_multi_site_truth_parquet_file, - "6:18:6", - False, - False, - "realization", # Provide realization input ), ( _create_multi_forecast_period_forecast_parquet_file, _create_multi_forecast_period_truth_parquet_file, "12", - False, - False, - "percentile", ), ], ) def test_load_for_qrf( tmp_path, + representation, + include_dynamic, + include_static, + include_noncube_static, + remove_target, forecast_creation, truth_creation, forecast_periods, - include_static, - remove_target, - representation, ): """Test the LoadForTrainQRF plugin.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} + parquet_diagnostic_names = ["temperature_at_screen_level"] - forecast_path, expected_forecast_df, wmo_ids = forecast_creation( + forecast_path, base_expected_forecast_df, wmo_ids = forecast_creation( tmp_path, representation ) - expected_forecast_df = amend_expected_forecast_df( - expected_forecast_df, - forecast_periods, - "temperature_at_screen_level", - "air_temperature", - ) - truth_path, expected_truth_df = truth_creation(tmp_path) - expected_truth_df = amend_expected_truth_df( - expected_truth_df, "temperature_at_screen_level" - ) file_paths = [forecast_path, truth_path] + if include_dynamic: + feature_config["wind_speed_at_10m"] = ["mean", "std"] + parquet_diagnostic_names.append("wind_speed_at_10m") + if include_static: ancil_path, expected_cube = _create_ancil_file( tmp_path, sorted(list(set(wmo_ids))) @@ -428,14 +436,28 @@ def test_load_for_qrf( file_paths.append(ancil_path) feature_config["distance_to_water"] = ["static"] + if include_noncube_static: + feature_config["wind_from_direction"] = ["static"] + parquet_diagnostic_names.append("wind_from_direction") + if remove_target: feature_config.pop("air_temperature") + expected_forecast_df = amend_expected_forecast_df( + base_expected_forecast_df.copy(), + forecast_periods, + parquet_diagnostic_names, + representation, + ) + expected_truth_df = amend_expected_truth_df( + expected_truth_df, "temperature_at_screen_level" + ) + # Create an instance of LoadForTrainQRF with the required parameters plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, - target_diagnostic_name="temperature_at_screen_level", + parquet_diagnostic_names=parquet_diagnostic_names, target_cf_name="air_temperature", forecast_periods=forecast_periods, cycletime="20170103T0000Z", @@ -479,7 +501,7 @@ def test_load_for_qrf_no_paths(tmp_path, make_files): plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, - target_diagnostic_name="temperature_at_screen_level", + parquet_diagnostic_names=["temperature_at_screen_level"], target_cf_name="air_temperature", forecast_periods="6:12:6", cycletime="20170102T0000Z", @@ -523,8 +545,8 @@ def test_load_for_qrf_mismatches( expected_forecast_df = amend_expected_forecast_df( expected_forecast_df, forecast_periods, - "temperature_at_screen_level", - "air_temperature", + ["temperature_at_screen_level"], + "percentile", ) truth_path, expected_truth_df = truth_creation(tmp_path) @@ -537,7 +559,7 @@ def test_load_for_qrf_mismatches( plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, - target_diagnostic_name="temperature_at_screen_level", + parquet_diagnostic_names=["temperature_at_screen_level"], target_cf_name="air_temperature", forecast_periods=forecast_periods, cycletime=cycletime, @@ -615,7 +637,7 @@ def test_unexpected( plugin = LoadForTrainQRF( experiment="latestblend", feature_config=feature_config, - target_diagnostic_name="temperature_at_screen_level", + parquet_diagnostic_names=["temperature_at_screen_level"], target_cf_name="air_temperature", forecast_periods=forecast_periods, cycletime="20170103T0000Z", @@ -655,92 +677,46 @@ def test_unexpected( raise ValueError(f"Unknown exception type: {exception}") +@pytest.mark.parametrize("representation", ["percentile", "realization"]) +@pytest.mark.parametrize("include_dynamic", [True, False]) +@pytest.mark.parametrize("include_static", [True, False]) +@pytest.mark.parametrize("include_noncube_static", [True, False]) +@pytest.mark.parametrize("remove_target", [True, False]) @pytest.mark.parametrize( - "forecast_creation,truth_creation,forecast_periods,include_static,remove_target,representation,expected", + "forecast_creation,truth_creation,forecast_periods", [ ( _create_multi_site_forecast_parquet_file, _create_multi_site_truth_parquet_file, "6:18:6", - False, - False, - "percentile", - 5.6, - ), - ( - _create_multi_site_forecast_parquet_file, - _create_multi_site_truth_parquet_file, - "6:18:6", - True, - False, - "percentile", - 5.64, - ), - ( - _create_multi_percentile_forecast_parquet_file, - _create_multi_percentile_truth_parquet_file, - "6:18:6", - False, - False, - "percentile", - 5.62, - ), - ( - _create_multi_percentile_forecast_parquet_file, - _create_multi_percentile_truth_parquet_file, - "6:18:6", - True, - False, - "percentile", - 5.62, - ), - ( - _create_multi_forecast_period_forecast_parquet_file, - _create_multi_forecast_period_truth_parquet_file, - "6:18:6", - False, - False, - "percentile", - 5.61, ), ( _create_multi_forecast_period_forecast_parquet_file, _create_multi_forecast_period_truth_parquet_file, "6:18:6", - True, - True, # Remove target feature - "percentile", - 5.62, ), ( _create_multi_site_forecast_parquet_file, _create_multi_site_truth_parquet_file, "6:18:6", - False, - False, - "realization", # Provide realization input - 5.62, ), ( _create_multi_forecast_period_forecast_parquet_file, _create_multi_forecast_period_truth_parquet_file, "12", - False, - False, - "percentile", - 5.64, ), ], ) def test_prepare_and_train_qrf( tmp_path, + representation, + include_dynamic, + include_static, + include_noncube_static, + remove_target, forecast_creation, truth_creation, forecast_periods, - include_static, - remove_target, - representation, - expected, ): """Test the PrepareAndTrainQRF plugin.""" feature_config = {"air_temperature": ["mean", "std", "altitude"]} @@ -751,21 +727,33 @@ def test_prepare_and_train_qrf( _, forecast_df, wmo_ids = forecast_creation(tmp_path, representation) forecast_df = amend_expected_forecast_df( - forecast_df, - forecast_periods, - "temperature_at_screen_level", - target_cf_name, + forecast_df, forecast_periods, ["temperature_at_screen_level"], representation ) _, truth_df = truth_creation(tmp_path) truth_df = amend_expected_truth_df(truth_df, "temperature_at_screen_level") + if include_dynamic: + forecast_df["wind_speed_at_10m"] = [10.0, 20.0, 15.0, 12.0, 11.0][ + : len(forecast_df) + ] + feature_config["wind_speed_at_10m"] = ["mean", "std"] + if include_static: _, ancil_cube = _create_ancil_file(tmp_path, sorted(list(set(wmo_ids)))) feature_config["distance_to_water"] = ["static"] + if include_noncube_static: + forecast_df["wind_from_direction"] = [90.0, 100.0, 110.0, 120.0, 130.0][ + : len(forecast_df) + ] + feature_config["wind_from_direction"] = ["static"] + if remove_target: feature_config.pop("air_temperature") + if feature_config == {}: + pytest.skip("No features to train on") + # Create an instance of PrepareAndTrainQRF with the required parameters plugin = PrepareAndTrainQRF( feature_config=feature_config, @@ -777,18 +765,30 @@ def test_prepare_and_train_qrf( pre_transform_addition=1, ) if include_static: - qrf_model = plugin(forecast_df, truth_df, iris.cube.CubeList([ancil_cube])) + qrf_model, transformation, pre_transform_addition = plugin( + forecast_df, truth_df, iris.cube.CubeList([ancil_cube]) + ) else: - qrf_model = plugin(forecast_df, truth_df) + qrf_model, transformation, pre_transform_addition = plugin( + forecast_df, truth_df + ) assert qrf_model.n_estimators == n_estimators assert qrf_model.max_depth == max_depth assert qrf_model.random_state == random_state + assert transformation == "log" + assert pre_transform_addition == 1 + + current_forecast = [5.64, 3, 55] if remove_target: current_forecast = [] - else: - current_forecast = [5.64, 3, 55] + + if include_dynamic: + current_forecast.extend([7, 1]) + + if include_noncube_static: + current_forecast.append(100) if include_static: current_forecast.append(2.5) @@ -796,4 +796,5 @@ def test_prepare_and_train_qrf( result = qrf_model.predict( np.expand_dims(np.array(current_forecast), 0), quantiles=[0.5] ) - np.testing.assert_almost_equal(result, expected, decimal=2) + expected = 5.6 + np.testing.assert_almost_equal(result, expected, decimal=1) diff --git a/improver_tests/calibration/test_init.py b/improver_tests/calibration/test_init.py index 49a7be7268..ef182a79b7 100644 --- a/improver_tests/calibration/test_init.py +++ b/improver_tests/calibration/test_init.py @@ -9,11 +9,13 @@ import iris import numpy as np +import pandas as pd import pytest from iris.cube import CubeList from improver.calibration import ( add_warning_comment, + get_training_period_cycles, split_forecasts_and_bias_files, split_forecasts_and_coeffs, split_forecasts_and_truth, @@ -613,5 +615,30 @@ def test_add_warning_to_comment(comment): assert result.attributes["comment"] == expected +@pytest.mark.parametrize( + "cycletime,forecast_period,training_length,expected", + [ + ("20171110T0000Z", 3600, 1, [pd.Timestamp(2017, 11, 9, 0, 0, tz="UTC")]), # noqa 1 hour + ("20171110T0000Z", 90000, 1, [pd.Timestamp(2017, 11, 8, 0, 0, tz="UTC")]), # noqa 25 hours + ("20171110T0000Z", 176400, 1, [pd.Timestamp(2017, 11, 7, 0, 0, tz="UTC")]), # noqa 49 hours + ( + "20171110T0000Z", + 3600, + 2, + [ + pd.Timestamp(2017, 11, 8, 0, 0, tz="UTC"), + pd.Timestamp(2017, 11, 9, 0, 0, tz="UTC"), + ], + ), # 1 hour, 2 days + ], +) +def test_get_training_period_cycles( + cycletime, forecast_period, training_length, expected +): + """Test that get_training_period_cycles returns the expected cycletimes.""" + result = get_training_period_cycles(cycletime, forecast_period, training_length) + assert list(result) == expected + + if __name__ == "__main__": unittest.main() From e549dac336b78b7ecee918ec4d116d2404d12f07 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Tue, 16 Sep 2025 16:06:00 +0100 Subject: [PATCH 05/15] Further updates. --- improver/calibration/dataframe_utilities.py | 6 +- ...apply_quantile_regression_random_forest.py | 5 +- ...train_quantile_regression_random_forest.py | 16 +- .../quantile_regression_random_forest.py | 16 +- ...apply_quantile_regression_random_forest.py | 69 +++++++-- ...train_quantile_regression_random_forest.py | 31 +++- .../test_quantile_regression_random_forest.py | 145 +++++++++++------- 7 files changed, 192 insertions(+), 96 deletions(-) diff --git a/improver/calibration/dataframe_utilities.py b/improver/calibration/dataframe_utilities.py index 857442390b..766f5aea75 100644 --- a/improver/calibration/dataframe_utilities.py +++ b/improver/calibration/dataframe_utilities.py @@ -128,7 +128,7 @@ def _unique_check(df: DataFrame, column: str) -> None: raise ValueError(msg) -def _quantile_check(df: DataFrame) -> None: +def quantile_check(df: DataFrame) -> None: """Check that the percentiles provided can be considered to be quantiles with equal spacing spanning the percentile range. @@ -142,7 +142,7 @@ def _quantile_check(df: DataFrame) -> None: if not np.allclose(expected_percentiles, df["percentile"].unique()): msg = ( - "The forecast percentiles can not be considered as quantiles. " + "Forecast percentiles must be equally spaced. " f"The forecast percentiles are {df['percentile'].unique()}." "Based on the number of percentiles provided, the expected " f"percentiles would be {expected_percentiles}." @@ -447,7 +447,7 @@ def _prepare_dataframes( # Check the percentiles can be considered to be equally space quantiles. if representation_type == "percentile": - _quantile_check(forecast_df) + quantile_check(forecast_df) # Remove forecast duplicates. forecast_cols = [ diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 8a3c8f0371..19ad390d96 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -236,8 +236,12 @@ def process( # Descriptors expected: (qrf_model, transformation, pre_transform_addition) qrf_descriptors = (None, None, 0) qrf_model, transformation, pre_transform_addition = qrf_descriptors + cube_inputs, forecast_cube = self._get_inputs(cube_inputs, qrf_model=qrf_model) + if cube_inputs: + assert_spatial_coords_match(cube_inputs) + if not self.quantile_forest_installed: return forecast_cube if not qrf_model: @@ -249,7 +253,6 @@ def process( elif forecast_cube.coords("realization"): percentiles = self._compute_percentiles(forecast_cube.copy(), "realization") - assert_spatial_coords_match(cube_inputs) df = self._cube_to_dataframe(cube_inputs) calibrated_forecast = ApplyQuantileRegressionRandomForests( diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index ba9a9e3d4f..602694ef41 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -349,7 +349,7 @@ def _check_matching_times( ) ) - def _add_static_features_from_cube_to_df( + def _add_static_features_from_cubes_to_df( self, forecast_df: pd.DataFrame, cube_inputs: iris.cube.CubeList ) -> pd.DataFrame: """Add features to the forecast DataFrame from cubes based on the feature @@ -361,7 +361,7 @@ def _add_static_features_from_cube_to_df( cube_inputs: List of cubes containing additional features. Returns: - DataFrame with additional features added. + DataFrame with additional features added from the input cubes. """ if cube_inputs is None or len(cube_inputs) == 0: return forecast_df @@ -381,8 +381,8 @@ def _add_static_features_from_cube_to_df( ) return forecast_df - @staticmethod def filter_bad_sites( + self, forecast_df: pd.DataFrame, truth_df: pd.DataFrame, ) -> tuple[pd.DataFrame, pd.DataFrame]: @@ -397,9 +397,11 @@ def filter_bad_sites( - DataFrame containing the forecast data with bad sites removed. - DataFrame containing the truth data with bad sites removed. """ - truth_df.dropna( - subset=["latitude", "longitude", "altitude", "ob_value"], inplace=True - ) + truth_df.dropna(subset=["ob_value"], inplace=True) + + if truth_df.empty: + msg = "Empty truth DataFrame after removing NaNs." + raise ValueError(msg) wmo_ids = set(forecast_df["wmo_id"]).intersection(set(truth_df["wmo_id"])) @@ -434,7 +436,7 @@ def process( if len(intersecting_times) == 0: return None - forecast_df = self._add_static_features_from_cube_to_df( + forecast_df = self._add_static_features_from_cubes_to_df( forecast_df, cube_inputs ) forecast_df, truth_df = self.filter_bad_sites(forecast_df, truth_df) diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index 5768eccb12..5f6aa0182b 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -11,6 +11,7 @@ from improver import BasePlugin, PostProcessingPlugin from improver.constants import DAYS_IN_YEAR, HOURS_IN_DAY +from improver.calibration.dataframe_utilities import quantile_check try: from quantile_forest import RandomForestQuantileRegressor @@ -39,7 +40,9 @@ def prep_feature( day of year, sine of day of year, cosine of day of year, hour of day, sine of hour of day and cosine of hour of day. When computing the mean or standard deviation, these will be computed over either the percentile or realization column, - depending upon which is available. + depending upon which is available. When a percentile column is provided, the + expectation is that these percentiles are equally spaced between 0 and 100, so that + these percentiles can be treated as being equally likely. Args: df: Input DataFrame. @@ -59,6 +62,10 @@ def prep_feature( representation_name, variable_name, ] + + if representation_name == "percentile": + quantile_check(df) + # For a subset of the input DataFrame compute the mean or standard deviation # over the representation column, grouped by the groupby columns. if feature_name == "mean": @@ -336,6 +343,7 @@ def process( feature_column_names = get_required_column_names( forecast_df, self.feature_config ) + merge_columns = ["wmo_id", "time"] combined_df = forecast_df.merge( truth_df[merge_columns + ["ob_value"]], on=merge_columns, how="inner" @@ -356,7 +364,7 @@ def __init__( feature_config: dict[str, list[str]], quantiles: list[np.float32], transformation: str = None, - pre_transform_addition: np.float32 = 0, + pre_transform_addition: np.float32 = 0, ) -> None: """Initialise the plugin. @@ -384,7 +392,7 @@ def __init__( transformation (str): Transformation to be applied to the data before fitting. pre_transform_addition (float): - Value to be added before transformation. + Value to be added before transformation. Raises: ValueError: If the transformation is not one of the supported types. @@ -395,7 +403,7 @@ def __init__( self.quantiles = quantiles self.transformation = transformation _check_valid_transformation(self.transformation) - self.pre_transform_addition = pre_transform_addition + self.pre_transform_addition = pre_transform_addition def _reverse_transformation(self, forecast: np.ndarray) -> np.ndarray: """Reverse the transformation applied to the data prior to fitting the QRF. diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py index 80948da017..e12c89e08e 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py @@ -45,18 +45,26 @@ def _add_day_of_training_period_to_cube(cube, day_of_training_period, secondary_ @pytest.mark.parametrize("percentile_input", [True, False]) @pytest.mark.parametrize( - "n_estimators,max_depth,random_state,compression,transformation,pre_transform_addition,extra_kwargs,include_static,quantiles,expected", + "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_dynamic,include_static,include_nans,include_latlon_nans,quantiles,expected", [ - (2, 2, 55, 5, None, 0, {}, False, [0.5], [4.1, 5.65]), # noqa Basic test case - (100, 2, 55, 5, None, 0, {}, False, [1 / 3, 2 / 3], [[4.1, 5.1], [5.1, 5.1]]), # noqa Multiple quantiles - (1, 1, 55, 5, None, 0, {}, False, [0.5], [4.1, 6.2]), # noqa Fewer estimators and reduced depth - (1, 1, 73, 5, None, 0, {}, False, [0.5], [4.2, 6.2]), # Different random state - (2, 2, 55, 5, "log", 10, {}, False, [0.5], [4.1, 5.64]), # Log transformation - (2, 2, 55, 5, "log10", 10, {}, False, [0.5], [4.1, 5.64]), # noqa Log10 transformation - (2, 2, 55, 5, "sqrt", 10, {}, False, [0.5], [4.1, 5.64]), # noqa Square root transformation - (2, 2, 55, 5, "cbrt", 10, {}, False, [0.5], [4.1, 5.64]), # noqa Cube root transformation - (2, 2, 55, 5, None, 0, {"max_samples_leaf": 0.5}, False, [0.5], [4.1, 6.2]), # noqa # Different criterion - (2, 5, 55, 5, None, 0, {}, True, [0.5], [4.1, 5.65]), # noqa Include an additional static feature + (2, 2, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 5.65]), # noqa Basic test case + (100, 2, 55, None, 0, {}, False, False, False, False, [1 / 3, 2 / 3], [[4.1, 5.1], [5.1, 5.1]]), # noqa Multiple quantiles + (1, 1, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 6.2]), # noqa Fewer estimators and reduced depth + (1, 1, 73, None, 0, {}, False, False, False, False, [0.5], [4.2, 6.2]), # Different random state + (2, 2, 55, "log", 10, {}, False, False, False, False, [0.5], [4.1, 5.64]), # Log transformation + (2, 2, 55, "log10", 10, {}, False, False, False, False,[0.5], [4.1, 5.64]), # noqa Log10 transformation + (2, 2, 55, "sqrt", 10, {}, False, False, False, False,[0.5], [4.1, 5.64]), # noqa Square root transformation + (2, 2, 55, "cbrt", 10, {}, False, False, False, False,[0.5], [4.1, 5.64]), # noqa Cube root transformation + (2, 2, 55, None, 0, {"max_samples_leaf": 0.5}, False, False, False, False, [0.5], [4.1, 6.2]), # noqa # Different criterion + (2, 5, 55, None, 0, {}, True, False, False, False, [0.5], [4.1, 4.6]), # noqa Include an additional dynamic feature + (2, 5, 55, None, 0, {}, False, True, False, False, [0.5], [4.1, 5.65]), # noqa Include an additional static feature + (2, 5, 55, None, 0, {}, True, True, False, False, [0.5], [4.1, 4.6]), # noqa Include an additional dynamic and static feature + (2, 2, 55, None, 0, {}, False, False, True, False, [0.5], [4.1, 5.65]), # noqa NaNs in input data + (2, 2, 55, None, 0, {}, False, False, False, True, [0.5], [4.1, 5.65]), # noqa NaNs in lat/lon + (2, 2, 55, None, 0, {}, True, False, False, True, [0.5], [4.1, 4.6]), # noqa NaNs in lat/lon and dynamic feature + (2, 2, 55, None, 0, {}, False, True, False, True, [0.5], [4.1, 5.65]), # noqa NaNs in lat/lon and static feature + (2, 2, 55, None, 0, {}, True, True, False, True, [0.5], [4.1, 4.6]), # noqa NaNs in lat/lon, dynamic and static feature + (2, 2, 55, None, 0, {}, True, True, True, True, [0.5], [4.6, 4.6]), # noqa NaNs in lat/lon, dynamic and static feature and input data ], ) def test_prepare_and_apply_qrf( @@ -64,23 +72,29 @@ def test_prepare_and_apply_qrf( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, + include_dynamic, include_static, + include_nans, + include_latlon_nans, quantiles, expected, ): """Test the PrepareAndApplyQRF plugin.""" feature_config = {"wind_speed_at_10m": ["mean", "std", "latitude", "longitude"]} + if include_dynamic: + feature_config["air_temperature"] = ["mean", "std"] + if include_static: + feature_config["distance_to_water"] = ["static"] + qrf_model = _run_train_qrf( feature_config, n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -112,10 +126,37 @@ def test_prepare_and_apply_qrf( forecast_cube = RebadgeRealizationsAsPercentiles()(forecast_cube) cube_inputs.append(forecast_cube) + if include_dynamic: + dynamic_cube = _create_forecasts( + frt, + vt, + data + 0.5, # Slightly different data to the target feature + return_cube=True, + ) + dynamic_cube.rename("air_temperature") + dynamic_cube = _add_day_of_training_period_to_cube( + dynamic_cube, day_of_training_period, "forecast_reference_time" + ) + cube_inputs.append(dynamic_cube) + if include_static: ancil_cube = _create_ancil_file(return_cube=True) cube_inputs.append(ancil_cube) + if include_nans: + # Add some NaNs to the input data to check that they are handled + for cube in cube_inputs: + if cube.name() == "distance_to_water": + cube.data[0] = np.nan + else: + cube.data[0, 0] = np.nan + + if include_latlon_nans: + # Add some NaNs to the latitude and longitude to check that they are handled + for cube in cube_inputs: + cube.coord("latitude").points[1] = np.nan + cube.coord("longitude").points[1] = np.nan + # cube_inputs, qrf_model = LoadForApplyQRF()(file_paths) result = PrepareAndApplyQRF( feature_config, @@ -161,7 +202,6 @@ def test_unexpected( n_estimators = 2 max_depth = 2 random_state = 55 - compression = 5 transformation = None pre_transform_addition = 0 extra_kwargs = {} @@ -173,7 +213,6 @@ def test_unexpected( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index 1fc33bd1b3..afd38d00d4 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -166,10 +166,10 @@ def _create_multi_forecast_period_forecast_parquet_file(tmp_path, representation "diagnostic": ["temperature_at_screen_level"] * 4, } if representation == "realization": - data_dict["realization"] = list(range(len(data_dict["percentile"]))) + data_dict["realization"] = [0, 1, 0, 1] data_dict.pop("percentile") elif representation == "kittens": - data_dict["kittens"] = list(range(len(data_dict["percentile"]))) + data_dict["kittens"] = [0, 1, 0, 1] data_dict.pop("percentile") # Add wind speed to demonstrate filtering. wind_speed_dict = data_dict.copy() @@ -682,6 +682,8 @@ def test_unexpected( @pytest.mark.parametrize("include_static", [True, False]) @pytest.mark.parametrize("include_noncube_static", [True, False]) @pytest.mark.parametrize("remove_target", [True, False]) +@pytest.mark.parametrize("include_nans", [True, False]) +@pytest.mark.parametrize("include_latlon_nans", [True, False]) @pytest.mark.parametrize( "forecast_creation,truth_creation,forecast_periods", [ @@ -691,13 +693,13 @@ def test_unexpected( "6:18:6", ), ( - _create_multi_forecast_period_forecast_parquet_file, - _create_multi_forecast_period_truth_parquet_file, + _create_multi_percentile_forecast_parquet_file, + _create_multi_percentile_truth_parquet_file, "6:18:6", ), ( - _create_multi_site_forecast_parquet_file, - _create_multi_site_truth_parquet_file, + _create_multi_forecast_period_forecast_parquet_file, + _create_multi_forecast_period_truth_parquet_file, "6:18:6", ), ( @@ -714,6 +716,8 @@ def test_prepare_and_train_qrf( include_static, include_noncube_static, remove_target, + include_nans, + include_latlon_nans, forecast_creation, truth_creation, forecast_periods, @@ -730,6 +734,7 @@ def test_prepare_and_train_qrf( forecast_df, forecast_periods, ["temperature_at_screen_level"], representation ) _, truth_df = truth_creation(tmp_path) + truth_df = amend_expected_truth_df(truth_df, "temperature_at_screen_level") if include_dynamic: @@ -751,6 +756,14 @@ def test_prepare_and_train_qrf( if remove_target: feature_config.pop("air_temperature") + if include_nans: + # Insert a NaN will result in this row being dropped. + truth_df.loc[0, "ob_value"] = pd.NA + + if include_latlon_nans: + # As latitude is not a feature, this NaN should be ignored. + truth_df.loc[1, "latitude"] = pd.NA + if feature_config == {}: pytest.skip("No features to train on") @@ -764,7 +777,11 @@ def test_prepare_and_train_qrf( transformation="log", pre_transform_addition=1, ) - if include_static: + if truth_df["ob_value"].isna().all(): + with pytest.raises(ValueError, match="Empty truth DataFrame"): + plugin(forecast_df, truth_df) + return + elif include_static: qrf_model, transformation, pre_transform_addition = plugin( forecast_df, truth_df, iris.cube.CubeList([ancil_cube]) ) diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py index f48d89cc3b..4a1b348624 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py @@ -42,6 +42,7 @@ def _create_forecasts( forecast_reference_time: str, validity_time: str, data: list[int], + representation: str = "realization", return_cube: bool = False, ) -> Cube | pd.DataFrame: """Create site forecast cube with realizations. @@ -68,6 +69,12 @@ def _create_forecasts( time=dt.strptime(validity_time, DT_FORMAT), frt=dt.strptime(forecast_reference_time, DT_FORMAT), ) + if representation == "percentile": + increment = (1 / (len(cube.coord("realization").points) + 1)) * 100 + cube.coord("realization").rename("percentile") + percentiles = np.array(np.arange(increment, 100, increment), dtype=np.float32) + cube.coord("percentile").points = percentiles + if return_cube: return cube df = as_data_frame( @@ -142,7 +149,6 @@ def _run_train_qrf( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -162,6 +168,7 @@ def _run_train_qrf( realization_data=[2, 6, 10], truth_data=[4.2, 3.8, 5.8, 6, 7, 7.3, 9.1, 9.5], tmp_path=None, + compression=5, ): realization_data = np.array(realization_data, dtype=np.float32) forecast_dfs = [] @@ -226,8 +233,9 @@ def test_quantile_forest_package_available(): assert result == expected +@pytest.mark.parametrize("representation", ["percentile", "realization"]) @pytest.mark.parametrize( - "feature,expected,expected_dtype", + "feature_name,expected,expected_dtype", [ ("mean", np.tile(np.array([6, 8], dtype=np.float32), 3), np.float32), ("std", np.repeat(4, 6).astype(np.float32), np.float32), @@ -257,17 +265,21 @@ def test_quantile_forest_package_available(): ("static", np.tile(np.array([2, 3], dtype=np.float32), 3), np.float32), ], ) -def test_prep_feature_single_time(feature, expected, expected_dtype): +def test_prep_feature_single_time( + representation, feature_name, expected, expected_dtype +): """Test the prep_feature function for a single time.""" - feature_name = "wind_speed_at_10m" + variable_name = "wind_speed_at_10m" forecast_reference_time = "20170101T0000Z" validity_time = "20170101T1200Z" data = np.array([2, 6, 10]) - forecast_df = _create_forecasts(forecast_reference_time, validity_time, data) + forecast_df = _create_forecasts( + forecast_reference_time, validity_time, data, representation + ) forecast_df = _add_day_of_training_period(forecast_df) - if feature == "static": + if feature_name == "static": feature_df = _create_ancil_file() forecast_df = forecast_df.merge( feature_df[["wmo_id", "distance_to_water"]], on=["wmo_id"], how="left" @@ -275,12 +287,12 @@ def test_prep_feature_single_time(feature, expected, expected_dtype): else: forecast_df = forecast_df.copy() - result = prep_feature(forecast_df, feature_name, feature) + result = prep_feature(forecast_df, variable_name, feature_name) - if feature in ["mean", "std"]: + if feature_name in ["mean", "std"]: assert result.shape == (6, 12) - feature_name_modified = f"{feature_name}_{feature}" - elif feature in [ + variable_name_modified = f"{variable_name}_{feature_name}" + elif feature_name in [ "day_of_year", "day_of_year_sin", "day_of_year_cos", @@ -289,20 +301,44 @@ def test_prep_feature_single_time(feature, expected, expected_dtype): "hour_of_day_cos", ]: assert result.shape == (6, 12) - feature_name_modified = feature - elif feature in ["static"]: + variable_name_modified = feature_name + elif feature_name in ["static"]: assert result.shape == (6, 12) - feature_name_modified = "distance_to_water" + variable_name_modified = "distance_to_water" else: assert result.shape == (6, 11) - feature_name_modified = feature + variable_name_modified = feature_name + + assert result[variable_name_modified].dtype == expected_dtype + np.testing.assert_allclose(result[variable_name_modified], expected, atol=1e-6) + + +@pytest.mark.parametrize("scenario", ["uneven_percentiles1", "uneven_percentiles2"]) +def test_prep_feature_invalid_percentiles(scenario): + """Test that an error is raised if invalid percentiles are provided.""" + variable_name = "wind_speed_at_10m" + + forecast_reference_time = "20170101T0000Z" + validity_time = "20170101T1200Z" + data = np.array([2, 6, 10]) + forecast_df = _create_forecasts( + forecast_reference_time, validity_time, data, representation="percentile" + ) + forecast_df = _add_day_of_training_period(forecast_df) + + if scenario == "uneven_percentiles1": + forecast_df.replace(to_replace={25: 10, 50: 20, 75: 20}, inplace=True) + elif scenario == "uneven_percentiles2": + forecast_df.replace(to_replace={25: 10, 75: 90}, inplace=True) - assert result[feature_name_modified].dtype == expected_dtype - np.testing.assert_allclose(result[feature_name_modified], expected, atol=1e-6) + with pytest.raises( + ValueError, match="Forecast percentiles must be equally spaced." + ): + prep_feature(forecast_df, variable_name, "mean") @pytest.mark.parametrize( - "feature,expected,expected_dtype", + "feature_name,expected,expected_dtype", [ ("mean", np.tile([6, 8], 18).astype(np.float32), np.float32), ("std", np.repeat(4, 36).astype(np.float32), np.float32), @@ -344,7 +380,7 @@ def test_prep_feature_single_time(feature, expected, expected_dtype): ("static", np.tile([2, 3], 18).astype(np.float32), np.float32), ], ) -def test_prep_feature_more_times(feature, expected, expected_dtype): +def test_prep_feature_more_times(feature_name, expected, expected_dtype): """Test the prep_feature function for multiple times.""" forecast_reference_times = [ "20170101T0000Z", @@ -365,7 +401,7 @@ def test_prep_feature_more_times(feature, expected, expected_dtype): data = np.array([2, 6, 10]) - feature_name = "wind_speed_at_10m" + variable_name = "wind_speed_at_10m" data = np.array([2, 6, 10]) @@ -375,7 +411,7 @@ def test_prep_feature_more_times(feature, expected, expected_dtype): forecast_df = pd.concat(forecast_dfs) forecast_df = _add_day_of_training_period(forecast_df) - if feature == "static": + if feature_name == "static": feature_df = _create_ancil_file() forecast_df = forecast_df.merge( feature_df[["wmo_id", "distance_to_water"]], on=["wmo_id"], how="left" @@ -384,12 +420,12 @@ def test_prep_feature_more_times(feature, expected, expected_dtype): else: forecast_df = forecast_df.copy() - result = prep_feature(forecast_df, feature_name, feature) + result = prep_feature(forecast_df, variable_name, feature_name) - if feature in ["mean", "std"]: + if feature_name in ["mean", "std"]: assert result.shape == (36, 12) - feature_name_modified = f"{feature_name}_{feature}" - elif feature in [ + variable_name_modified = f"{variable_name}_{feature_name}" + elif feature_name in [ "day_of_year", "day_of_year_sin", "day_of_year_cos", @@ -398,16 +434,16 @@ def test_prep_feature_more_times(feature, expected, expected_dtype): "hour_of_day_cos", ]: assert result.shape == (36, 12) - feature_name_modified = feature - elif feature in ["static"]: + variable_name_modified = feature_name + elif feature_name in ["static"]: assert result.shape == (36, 12) - feature_name_modified = "distance_to_water" + variable_name_modified = "distance_to_water" else: assert result.shape == (36, 11) - feature_name_modified = feature + variable_name_modified = feature_name - assert result[feature_name_modified].dtype == expected_dtype - np.testing.assert_allclose(result[feature_name_modified], expected, atol=1e-6) + assert result[variable_name_modified].dtype == expected_dtype + np.testing.assert_allclose(result[variable_name_modified], expected, atol=1e-6) @pytest.mark.parametrize( @@ -516,24 +552,23 @@ def test_check_valid_transformation(transformation): @pytest.mark.parametrize( - "n_estimators,max_depth,random_state,compression,transformation,pre_transform_addition,extra_kwargs,include_static,expected", + "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_static,expected", [ - (2, 2, 55, 5, None, 0, {}, False, 4.2), # Basic test case - (2, 2, 54, 5, None, 0, {}, False, 4.15), # Different random state - (1, 1, 54, 5, None, 0, {}, False, 4.2), # Fewer estimators and reduced depth - (2, 2, 55, 5, "log", 10, {}, False, 2.65), # Log transformation - (2, 2, 55, 5, "log10", 10, {}, False, 1.15), # Log10 transformation - (2, 2, 55, 5, "sqrt", 10, {}, False, 3.76), # Square root transformation - (2, 2, 55, 5, "cbrt", 10, {}, False, 2.42), # Cube root transformation - (2, 2, 55, 5, None, 0, {"criterion": "absolute_error"}, False, 4.2), # noqa # Different criterion - (2, 5, 55, 5, None, 0, {}, True, 4.2), # Include static data + (2, 2, 55, None, 0, {}, False, 4.2), # Basic test case + (2, 2, 54, None, 0, {}, False, 4.15), # Different random state + (1, 1, 54, None, 0, {}, False, 4.2), # Fewer estimators and reduced depth + (2, 2, 55, "log", 10, {}, False, 2.65), # Log transformation + (2, 2, 55, "log10", 10, {}, False, 1.15), # Log10 transformation + (2, 2, 55, "sqrt", 10, {}, False, 3.76), # Square root transformation + (2, 2, 55, "cbrt", 10, {}, False, 2.42), # Cube root transformation + (2, 2, 55, None, 0, {"criterion": "absolute_error"}, False, 4.2), # noqa # Different criterion + (2, 5, 55, None, 0, {}, True, 4.2), # Include static data ], ) def test_train_qrf_single_lead_times( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -550,7 +585,6 @@ def test_train_qrf_single_lead_times( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -581,24 +615,23 @@ def test_train_qrf_single_lead_times( @pytest.mark.parametrize( - "n_estimators,max_depth,random_state,compression,transformation,pre_transform_addition,extra_kwargs,include_static,expected", + "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_static,expected", [ - (2, 2, 55, 5, None, 0, {}, False, 5.8), # Basic test case - (1, 1, 55, 5, None, 0, {}, False, 3.8), # Fewer estimators and reduced depth - (1, 1, 73, 5, None, 0, {}, False, 5.8), # Different random state - (2, 2, 55, 5, "log", 10, {}, False, 2.752), # Log transformation - (2, 2, 55, 5, "log10", 10, {}, False, 1.195), # Log10 transformation - (2, 2, 55, 5, "sqrt", 10, {}, False, 3.964), # Square root transformation - (2, 2, 55, 5, "cbrt", 10, {}, False, 2.504), # Cube root transformation - (2, 2, 55, 5, None, 0, {"criterion": "absolute_error"}, False, 5.8), # noqa # Different criterion - (1, 1, 73, 5, None, 0, {}, True, 3.8), # Include static data + (2, 2, 55, None, 0, {}, False, 5.8), # Basic test case + (1, 1, 55, None, 0, {}, False, 3.8), # Fewer estimators and reduced depth + (1, 1, 73, None, 0, {}, False, 5.8), # Different random state + (2, 2, 55, "log", 10, {}, False, 2.752), # Log transformation + (2, 2, 55, "log10", 10, {}, False, 1.195), # Log10 transformation + (2, 2, 55, "sqrt", 10, {}, False, 3.964), # Square root transformation + (2, 2, 55, "cbrt", 10, {}, False, 2.504), # Cube root transformation + (2, 2, 55, None, 0, {"criterion": "absolute_error"}, False, 5.8), # noqa # Different criterion + (1, 1, 73, None, 0, {}, True, 3.8), # Include static data ], ) def test_train_qrf_multiple_lead_times( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -615,7 +648,6 @@ def test_train_qrf_multiple_lead_times( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -666,7 +698,6 @@ def test_alternative_feature_configs( n_estimators = 2 max_depth = 5 random_state = 55 - compression = 5 extra_kwargs = {} transformation = None pre_transform_addition = 0 @@ -678,7 +709,6 @@ def test_alternative_feature_configs( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -691,7 +721,6 @@ def test_alternative_feature_configs( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, @@ -756,7 +785,6 @@ def test_apply_qrf( n_estimators = 2 max_depth = 2 random_state = 55 - compression = 5 extra_kwargs = {} qrf_model = _run_train_qrf( @@ -764,7 +792,6 @@ def test_apply_qrf( n_estimators, max_depth, random_state, - compression, transformation, pre_transform_addition, extra_kwargs, From c7b769c7bd75b5abc3fcd7fb55af62f42d6ee5a0 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Tue, 16 Sep 2025 16:50:36 +0100 Subject: [PATCH 06/15] Create prep_features_from_config to consolidate checking of valid features from the config and the calling of prep_features to reduce for loops. --- .../quantile_regression_random_forest.py | 69 ++++++++++--------- .../test_quantile_regression_random_forest.py | 35 +++++++--- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index 5f6aa0182b..4976af92fd 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -152,36 +152,48 @@ def sanitise_forecast_dataframe( return df -def get_required_column_names( +def prep_features_from_config( df: pd.DataFrame, feature_config: dict[str, list[str]] -) -> list[str]: - """Process the feature_config to return the expected column names that will be - used as features with the QRF. +) -> tuple[pd.DataFrame, list[str]]: + """Process the feature_config to prepare the features as required and return the + expected column names that will be used as features with the QRF. Args: df: Input DataFrame. feature_config: Feature configuration defining the features to be used for QRF. Returns: - List of expected column names that will be used as features with the QRF. + Processed DataFrame and a list of expected column names that will be used as + features with the QRF. Raises: - ValueError: If a feature expected in the feature_config is not present in - the DataFrame. + ValueError: If a variable expected in the feature_config is not present in + the DataFrame e.g. "surface temperature". + ValueError: If a feature expected for a specific variable in the feature_config + is not supported e.g. "interquartile_range". """ feature_column_names = [] for variable_name in feature_config.keys(): - for feature in feature_config[variable_name]: - if feature in ["mean", "std"]: - feature_column_names.append(f"{variable_name}_{feature}") - elif feature in ["static"]: + if variable_name not in df.columns: + msg = f"Feature '{variable_name}' is not present in the forecast DataFrame." + raise ValueError(msg) + for feature_name in feature_config[variable_name]: + df = prep_feature(df, variable_name, feature_name) + + if feature_name in ["mean", "std"]: + feature_column_names.append(f"{variable_name}_{feature_name}") + elif feature_name in ["static"]: feature_column_names.append(variable_name) + elif feature_name in df.columns: + # For example, latitude, longitude, altitude, day_of_year, which + # are not tied to a specific variable. + feature_column_names.append(feature_name) else: - feature_column_names.append(feature) - - if len(list(set(feature_column_names) - set(df.columns))) > 0: - msg = f"Feature '{feature}' is not supported." - raise ValueError(msg) + msg = ( + f"Feature '{feature_name}' for variable " + f"'{variable_name}' is not supported." + ) + raise ValueError(msg) - return feature_column_names + return df, feature_column_names def _check_valid_transformation(transformation: str): @@ -329,21 +341,12 @@ def process( truth_df["ob_value"] + self.pre_transform_addition ) - for variable_name in self.feature_config.keys(): - if variable_name not in forecast_df.columns: - msg = ( - f"Feature '{variable_name}' is not present in the " - "forecast DataFrame." - ) - raise ValueError(msg) - for feature_name in self.feature_config[variable_name]: - forecast_df = prep_feature(forecast_df, variable_name, feature_name) - - forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) - feature_column_names = get_required_column_names( + forecast_df, feature_column_names = prep_features_from_config( forecast_df, self.feature_config ) + forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) + merge_columns = ["wmo_id", "time"] combined_df = forecast_df.merge( truth_df[merge_columns + ["ob_value"]], on=merge_columns, how="inner" @@ -453,14 +456,12 @@ def process( forecast_df[self.target_name] = getattr(np, self.transformation)( forecast_df[self.target_name] + self.pre_transform_addition ) + break - for feature_name in self.feature_config[variable_name]: - forecast_df = prep_feature(forecast_df, variable_name, feature_name) - - forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) - feature_column_names = get_required_column_names( + forecast_df, feature_column_names = prep_features_from_config( forecast_df, self.feature_config ) + forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) feature_values = np.array(forecast_df[feature_column_names]) diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py index 4a1b348624..a2c0b7c5f1 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py @@ -20,7 +20,7 @@ ApplyQuantileRegressionRandomForests, TrainQuantileRegressionRandomForests, _check_valid_transformation, - get_required_column_names, + prep_features_from_config, prep_feature, quantile_forest_package_available, sanitise_forecast_dataframe, @@ -501,24 +501,25 @@ def test_sanitise_forecast_dataframe(feature_config): "distance_to_water": ["static"], }, {"wind_speed_at_10m": ["latitude", "longitude", "height"]}, + {"wind_speed_at_10m": ["mean", "std"], "surface_temperature": ["mean"]}, ], ) -def test_get_required_column_names(feature_config): - """Test the get_required_column_names function.""" +def test_prep_features_from_config(feature_config): + """Test the prep_features_from_config function.""" data_dict = { "wmo_id": np.tile(WMO_ID, 3), "latitude": np.tile(LATITUDE, 3), "longitude": np.tile(LONGITUDE, 3), "altitude": np.tile(ALTITUDE, 3), - "wind_speed_at_10m_mean": np.repeat(5, 6), - "wind_speed_at_10m_std": np.repeat(1, 6), "wind_speed_at_10m": np.tile([4, 6], 3), "realization": np.tile([1, 2], 3), "distance_to_water": np.tile([2.0, 3.0], 3), + "blend_time": pd.Timestamp("2017-01-02 00:00:00", tz="utc"), + "forecast_period": 6 * 3.6e12, + "forecast_reference_time": pd.Timestamp("2017-01-02 00:00:00", tz="utc"), } df = pd.DataFrame(data_dict) - # expected = feature_config["wind_speed_at_10m"].copy() expected = list(itertools.chain.from_iterable(feature_config.values())) if "mean" in expected: expected = ["wind_speed_at_10m_mean" if e == "mean" else e for e in expected] @@ -528,11 +529,25 @@ def test_get_required_column_names(feature_config): expected = ["distance_to_water" if e == "static" else e for e in expected] if "height" in expected: - with pytest.raises(ValueError, match="Feature 'height' is not supported."): - get_required_column_names(df, feature_config) + with pytest.raises( + ValueError, + match=( + "Feature 'height' for variable 'wind_speed_at_10m' is not supported." + ), + ): + prep_features_from_config(df, feature_config) + elif "surface_temperature" in feature_config.keys(): + with pytest.raises( + ValueError, + match=( + "Feature 'surface_temperature' is not present " + "in the forecast DataFrame." + ), + ): + prep_features_from_config(df, feature_config) else: - result = get_required_column_names(df, feature_config) - assert result == expected + _, result_names = prep_features_from_config(df, feature_config) + assert result_names == expected @pytest.mark.parametrize( From 5d4cfc38acc2297d2a8870578ec1ebea93557691 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Tue, 16 Sep 2025 17:25:25 +0100 Subject: [PATCH 07/15] Minor edits. --- ...apply_quantile_regression_random_forest.py | 55 +++++-------------- .../quantile_regression_random_forest.py | 5 +- 2 files changed, 16 insertions(+), 44 deletions(-) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 19ad390d96..a9ae2d5fd9 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -131,46 +131,21 @@ def _get_inputs( return cube_inputs, forecast_cube @staticmethod - def _compute_percentiles(forecast_cube: Cube, coord: str) -> list[float]: - """Compute the percentiles from the forecast cube. + def _compute_quantile_list(forecast_cube: Cube, coord: str) -> list[float]: + """Compute the list of quantiles e.g. 0.25, 0.5, 0.75 that will be produced + by using the forecast cube. Args: forecast_cube: Forecast to be calibrated. coord: Coordinate name. The length of the coordinate will be used to - determine the number of percentiles to compute. + determine the number of quantiles to compute. Returns: - List of percentiles computed from the forecast cube. + List of quantiles (e.g. 0.25, 0.5, 0.75) computed from the forecast cube. """ n_percentiles = len(forecast_cube.coord(coord).points) - percentiles = ( - np.array(choose_set_of_percentiles(n_percentiles)) / 100 - ).tolist() - return percentiles - - @staticmethod - def _percentiles_to_realizations(cube_inputs: CubeList) -> CubeList: - """Convert percentiles to realizations. The input forecasts are expected to - be percentiles but these percentiles are rebadged as realizations. - - Args: - cube_inputs: - List of cubes containing the features and the forecast to be calibrated. - Some may be percentiles. - Returns: - cube_inputs: - List of cubes with percentiles rebadged as realizations, - where appropriate - """ - - # Ensure there is a realization dimension on all cubes. This assumes a - # percentile dimension is present. - realization_cube_inputs = iris.cube.CubeList([]) - for feature_cube in cube_inputs: - if feature_cube.coords("percentile"): - feature_cube = RebadgePercentilesAsRealizations()(feature_cube) - realization_cube_inputs.append(feature_cube) - return realization_cube_inputs + quantiles = (np.array(choose_set_of_percentiles(n_percentiles)) / 100).tolist() + return quantiles @staticmethod def _cube_to_dataframe(cube_inputs: CubeList) -> pd.DataFrame: @@ -242,23 +217,23 @@ def process( if cube_inputs: assert_spatial_coords_match(cube_inputs) - if not self.quantile_forest_installed: - return forecast_cube - if not qrf_model: + if not self.quantile_forest_installed or not qrf_model: return forecast_cube template_forecast_cube = forecast_cube.copy() - if forecast_cube.coords("percentile"): - percentiles = self._compute_percentiles(forecast_cube.copy(), "percentile") - elif forecast_cube.coords("realization"): - percentiles = self._compute_percentiles(forecast_cube.copy(), "realization") + if forecast_cube.coords("realization"): + quantile_list = self._compute_quantile_list( + forecast_cube.copy(), "realization" + ) + elif forecast_cube.coords("percentile"): + quantile_list = forecast_cube.coord("percentile").points / 100.0 df = self._cube_to_dataframe(cube_inputs) calibrated_forecast = ApplyQuantileRegressionRandomForests( target_name=self.target_cf_name, feature_config=self.feature_config, - quantiles=percentiles, + quantiles=quantile_list, transformation=transformation, pre_transform_addition=pre_transform_addition, )(qrf_model, df) diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index 4976af92fd..51077c972e 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -397,9 +397,6 @@ def __init__( pre_transform_addition (float): Value to be added before transformation. - Raises: - ValueError: If the transformation is not one of the supported types. - """ self.target_name = target_name self.feature_config = feature_config @@ -468,7 +465,7 @@ def process( calibrated_forecast = qrf_model.predict( feature_values, quantiles=self.quantiles ) + calibrated_forecast = self._reverse_transformation(calibrated_forecast) calibrated_forecast = np.float32(calibrated_forecast) - calibrated_forecast = self._reverse_transformation(calibrated_forecast) return calibrated_forecast From 10cf0de593d130b799f664bed9505c933890970c Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Wed, 17 Sep 2025 12:21:21 +0100 Subject: [PATCH 08/15] Edits to support using a coordinate other than wmo_id as a site identifier for training and application. --- improver/calibration/__init__.py | 1 + ...apply_quantile_regression_random_forest.py | 19 ++++--- ...train_quantile_regression_random_forest.py | 30 ++++++++--- .../quantile_regression_random_forest.py | 25 ++++++--- ...apply_quantile_regression_random_forest.py | 14 +++-- ...train_quantile_regression_random_forest.py | 11 +++- ...apply_quantile_regression_random_forest.py | 5 +- ...train_quantile_regression_random_forest.py | 53 ++++++++++++++----- .../test_quantile_regression_random_forest.py | 41 +++++++++----- 9 files changed, 141 insertions(+), 58 deletions(-) diff --git a/improver/calibration/__init__.py b/improver/calibration/__init__.py index b498168829..2333a94cca 100644 --- a/improver/calibration/__init__.py +++ b/improver/calibration/__init__.py @@ -56,6 +56,7 @@ def __init__(self): ("altitude", pa.float32()), ("time", pa.timestamp("s", "utc")), ("wmo_id", pa.string()), + ("station_id", pa.string()), ("ob_value", pa.float32()), ] ) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index a9ae2d5fd9..0bbeea7357 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -20,9 +20,6 @@ ApplyQuantileRegressionRandomForests, quantile_forest_package_available, ) -from improver.ensemble_copula_coupling.ensemble_copula_coupling import ( - RebadgePercentilesAsRealizations, -) from improver.ensemble_copula_coupling.utilities import choose_set_of_percentiles from improver.utilities.cube_checker import assert_spatial_coords_match @@ -44,6 +41,7 @@ def __init__( self, feature_config: dict[str, list[str]], target_cf_name: str, + unique_site_id_key: str = "wmo_id", ): """Initialise the plugin. @@ -69,9 +67,14 @@ def __init__( A string containing the CF name of diagnostic to be calibrated. This will be used to separate it from the rest of the dynamic predictors, if present. + unique_site_id_key (str): + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. """ self.feature_config = feature_config self.target_cf_name = target_cf_name + self.unique_site_id_key = unique_site_id_key self.quantile_forest_installed = quantile_forest_package_available() def _get_inputs( @@ -147,8 +150,7 @@ def _compute_quantile_list(forecast_cube: Cube, coord: str) -> list[float]: quantiles = (np.array(choose_set_of_percentiles(n_percentiles)) / 100).tolist() return quantiles - @staticmethod - def _cube_to_dataframe(cube_inputs: CubeList) -> pd.DataFrame: + def _cube_to_dataframe(self, cube_inputs: CubeList) -> pd.DataFrame: """Convert cube inputs to a pandas DataFrame. Args: @@ -161,11 +163,13 @@ def _cube_to_dataframe(cube_inputs: CubeList) -> pd.DataFrame: # Convert the first cube to a DataFrame. df = as_data_frame(cube_inputs[0], add_aux_coords=True).reset_index() + + # Iteratively convert remaining cubes to DataFrame and merge. for cube in cube_inputs[1:]: temporary_df = as_data_frame(cube, add_aux_coords=True).reset_index() possible_columns = [ - "wmo_id", + self.unique_site_id_key, "time", "forecast_reference_time", "forecast_period", @@ -226,7 +230,7 @@ def process( forecast_cube.copy(), "realization" ) elif forecast_cube.coords("percentile"): - quantile_list = forecast_cube.coord("percentile").points / 100.0 + quantile_list = (forecast_cube.coord("percentile").points / 100.0).tolist() df = self._cube_to_dataframe(cube_inputs) @@ -236,6 +240,7 @@ def process( quantiles=quantile_list, transformation=transformation, pre_transform_addition=pre_transform_addition, + unique_site_id_key=self.unique_site_id_key, )(qrf_model, df) calibrated_forecast_cube = template_forecast_cube.copy( data=np.broadcast_to(calibrated_forecast.T, template_forecast_cube.shape) diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 602694ef41..868c2b1757 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -52,6 +52,7 @@ def __init__( cycletime: str, training_length: int, experiment: Optional[str] = None, + unique_site_id_key: str = "wmo_id", ): """Initialise the LoadForQRF plugin. @@ -72,6 +73,8 @@ def __init__( YYYYMMDDTHHMMZ. training_length: The number of days of training data to use. experiment: The name of the experiment (step) that calibration is applied to. + unique_site_id_key: The name of the column in the parquet files that + contains the unique site identifier e.g. wmo_id. """ self.quantile_forest_installed = quantile_forest_package_available() self.feature_config = feature_config @@ -81,6 +84,7 @@ def __init__( self.cycletime = cycletime self.training_length = training_length self.experiment = experiment + self.unique_site_id_key = unique_site_id_key def _parse_forecast_periods(self) -> list[int]: """Parse the forecast periods argument to produce a list of forecast periods @@ -200,7 +204,7 @@ def _read_parquet_files( base_df, additional_df[ [ - "wmo_id", + self.unique_site_id_key, "forecast_reference_time", "forecast_period", representation, @@ -208,7 +212,7 @@ def _read_parquet_files( ] ].rename(columns={"forecast": parquet_diagnostic_name}), on=[ - "wmo_id", + self.unique_site_id_key, "forecast_reference_time", "forecast_period", representation, @@ -261,8 +265,13 @@ def process( return None cube_inputs, parquets, _ = split_pickle_parquet_and_netcdf(file_paths) + # If there are no parquet files, return None. + if not parquets: + return None + forecast_table_path, truth_table_path = identify_parquet_type(parquets) + # If either the forecast or truth parquet files are missing, return None. if not forecast_table_path or not truth_table_path: return None @@ -302,6 +311,7 @@ def __init__( random_state: Optional[int] = None, transformation: Optional[str] = None, pre_transform_addition: float = 0, + unique_site_id_key: str = "wmo_id", ): """Initialise the PrepareAndTrainQRF plugin. @@ -317,6 +327,8 @@ def __init__( random_state: Seed used by the random number generator. transformation: Transformation to be applied to the data before fitting. pre_transform_addition: Value to be added before transformation. + unique_site_id_key: The name of the column in the parquet files that + contains the unique site identifier e.g. wmo_id. """ self.feature_config = feature_config self.target_cf_name = target_cf_name @@ -326,6 +338,7 @@ def __init__( self.random_state = random_state self.transformation = transformation self.pre_transform_addition = pre_transform_addition + self.unique_site_id_key = unique_site_id_key self.quantile_forest_installed = quantile_forest_package_available() @staticmethod @@ -377,7 +390,9 @@ def _add_static_features_from_cubes_to_df( feature_cube = cube_inputs.extract_cube(constr) feature_df = as_data_frame(feature_cube, add_aux_coords=True) forecast_df = forecast_df.merge( - feature_df[["wmo_id", feature_name]], on=["wmo_id"], how="left" + feature_df[[self.unique_site_id_key, feature_name]], + on=[self.unique_site_id_key], + how="left", ) return forecast_df @@ -403,10 +418,12 @@ def filter_bad_sites( msg = "Empty truth DataFrame after removing NaNs." raise ValueError(msg) - wmo_ids = set(forecast_df["wmo_id"]).intersection(set(truth_df["wmo_id"])) + site_ids = set(forecast_df[self.unique_site_id_key]).intersection( + set(truth_df[self.unique_site_id_key]) + ) - forecast_df = forecast_df[forecast_df["wmo_id"].isin(wmo_ids)] - truth_df = truth_df[truth_df["wmo_id"].isin(wmo_ids)] + forecast_df = forecast_df[forecast_df[self.unique_site_id_key].isin(site_ids)] + truth_df = truth_df[truth_df[self.unique_site_id_key].isin(site_ids)] return forecast_df, truth_df def process( @@ -450,6 +467,7 @@ def process( random_state=self.random_state, transformation=self.transformation, pre_transform_addition=self.pre_transform_addition, + unique_site_id_key=self.unique_site_id_key, )(forecast_df, truth_df) # Create a tuple that returns the model, transformation and diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index 51077c972e..ff9d2bc336 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -34,6 +34,7 @@ def prep_feature( df: pd.DataFrame, variable_name: str, feature_name: str, + unique_site_id_key: str = "wmo_id", ) -> pd.DataFrame: """Prepare features that require computation from the input DataFrame. Options available are mean and standard deviation of the input feature, the @@ -50,6 +51,8 @@ def prep_feature( feature_name: Feature to be computed. Options are "mean", "std", "day_of_year", "day_of_year_sin", "day_of_year_cos", "hour_of_day", "hour_of_day_sin" and "hour_of_day_cos". + unique_site_id_key: The name of the column in the DataFrame that + contains the unique site identifier e.g. wmo_id. Returns: df: DataFrame with the computed feature added. """ @@ -57,7 +60,7 @@ def prep_feature( representation_name = [ n for n in ["percentile", "realization"] if n in df.columns ][0] - groupby_cols = ["forecast_reference_time", "forecast_period", "wmo_id"] + groupby_cols = ["forecast_reference_time", "forecast_period", unique_site_id_key] subset_cols = [*groupby_cols] + [ representation_name, variable_name, @@ -153,7 +156,7 @@ def sanitise_forecast_dataframe( def prep_features_from_config( - df: pd.DataFrame, feature_config: dict[str, list[str]] + df: pd.DataFrame, feature_config: dict[str, list[str]], unique_site_id_key: str = "wmo_id" ) -> tuple[pd.DataFrame, list[str]]: """Process the feature_config to prepare the features as required and return the expected column names that will be used as features with the QRF. @@ -161,6 +164,8 @@ def prep_features_from_config( Args: df: Input DataFrame. feature_config: Feature configuration defining the features to be used for QRF. + unique_site_id_key: The name of the column in the DataFrame that + contains the unique site identifier e.g. wmo_id. Returns: Processed DataFrame and a list of expected column names that will be used as features with the QRF. @@ -176,7 +181,7 @@ def prep_features_from_config( msg = f"Feature '{variable_name}' is not present in the forecast DataFrame." raise ValueError(msg) for feature_name in feature_config[variable_name]: - df = prep_feature(df, variable_name, feature_name) + df = prep_feature(df, variable_name, feature_name, unique_site_id_key) if feature_name in ["mean", "std"]: feature_column_names.append(f"{variable_name}_{feature_name}") @@ -224,6 +229,7 @@ def __init__( random_state: Optional[int] = None, transformation: Optional[str] = None, pre_transform_addition: np.float32 = 0, + unique_site_id_key: str = "wmo_id", **kwargs, ) -> None: """Initialise the plugin. @@ -262,6 +268,8 @@ def __init__( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. + unique_site_id_key: The name of the column in the DataFrame that + contains the unique site identifier e.g. wmo_id. kwargs: Additional keyword arguments for the quantile regression model. @@ -276,6 +284,7 @@ def __init__( self.transformation = transformation _check_valid_transformation(self.transformation) self.pre_transform_addition = pre_transform_addition + self.unique_site_id_key = unique_site_id_key self.kwargs = kwargs self.expected_coordinate_order = ["forecast_reference_time", "forecast_period"] @@ -342,12 +351,12 @@ def process( ) forecast_df, feature_column_names = prep_features_from_config( - forecast_df, self.feature_config + forecast_df, self.feature_config, unique_site_id_key=self.unique_site_id_key ) forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) - merge_columns = ["wmo_id", "time"] + merge_columns = [self.unique_site_id_key, "time"] combined_df = forecast_df.merge( truth_df[merge_columns + ["ob_value"]], on=merge_columns, how="inner" ) @@ -368,6 +377,7 @@ def __init__( quantiles: list[np.float32], transformation: str = None, pre_transform_addition: np.float32 = 0, + unique_site_id_key: str = "wmo_id", ) -> None: """Initialise the plugin. @@ -396,6 +406,8 @@ def __init__( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. + unique_site_id_key: The name of the column in the DataFrame that + contains the unique site identifier e.g. wmo_id. """ self.target_name = target_name @@ -404,6 +416,7 @@ def __init__( self.transformation = transformation _check_valid_transformation(self.transformation) self.pre_transform_addition = pre_transform_addition + self.unique_site_id_key = unique_site_id_key def _reverse_transformation(self, forecast: np.ndarray) -> np.ndarray: """Reverse the transformation applied to the data prior to fitting the QRF. @@ -456,7 +469,7 @@ def process( break forecast_df, feature_column_names = prep_features_from_config( - forecast_df, self.feature_config + forecast_df, self.feature_config, self.unique_site_id_key ) forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) diff --git a/improver/cli/apply_quantile_regression_random_forest.py b/improver/cli/apply_quantile_regression_random_forest.py index a18c15f360..c442b0342b 100644 --- a/improver/cli/apply_quantile_regression_random_forest.py +++ b/improver/cli/apply_quantile_regression_random_forest.py @@ -14,8 +14,7 @@ def process( *file_paths: cli.inputpath, feature_config: cli.inputjson, target_cf_name: str, - transformation: str = None, - pre_transform_addition: float = 0, + unique_site_id_key: str = "wmo_id", ): """Applying the Quantile Regression Random Forest model. @@ -51,10 +50,10 @@ def process( A string containing the CF name of the forecast to be calibrated e.g. air_temperature. This will be used to separate it from the rest of the feature cubes, if present. - transformation (str): - Transformation to be applied to the data before fitting. - pre_transform_addition (float): - Value to be added before transformation. + unique_site_id_key (str): + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. Returns: iris.cube.Cube: The calibrated forecast cube. @@ -69,7 +68,6 @@ def process( result = PrepareAndApplyQRF( feature_config=feature_config, target_cf_name=target_cf_name, - transformation=transformation, - pre_transform_addition=pre_transform_addition, + unique_site_id_key=unique_site_id_key, )(cubes, qrf_model=qrf_model) return result diff --git a/improver/cli/train_quantile_regression_random_forest.py b/improver/cli/train_quantile_regression_random_forest.py index 87c28127a7..4802cdbb26 100644 --- a/improver/cli/train_quantile_regression_random_forest.py +++ b/improver/cli/train_quantile_regression_random_forest.py @@ -25,6 +25,7 @@ def process( random_state: int = None, transformation: str = None, pre_transform_addition: float = 0, + unique_site_id_key: str = "wmo_id", ): """Training a model using Quantile Regression Random Forest. @@ -60,8 +61,8 @@ def process( "distance_to_water": ["static"], } parquet_diagnostic_names (str): - A string containing the diagnostic name that will be used for filtering - the target diagnostic from the forecast and truth DataFrames read in + A string containing the diagnostic name that will be used for filtering + the target diagnostic from the forecast and truth DataFrames read in from the parquet files. This could be different from the CF name e.g. 'temperature_at_screen_level'. target_cf_name (str): @@ -99,6 +100,10 @@ def process( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. + unique_site_id_key (str): + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. Returns: A quantile regression random forest model. """ @@ -116,6 +121,7 @@ def process( forecast_periods=forecast_periods, cycletime=cycletime, training_length=training_length, + unique_site_id_key=unique_site_id_key, )(file_paths) result = PrepareAndTrainQRF( feature_config=feature_config, @@ -126,6 +132,7 @@ def process( random_state=random_state, transformation=transformation, pre_transform_addition=pre_transform_addition, + unique_site_id_key=unique_site_id_key, )(forecast_df, truth_df, cube_inputs) return result diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py index e12c89e08e..3e1213f854 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py @@ -44,6 +44,7 @@ def _add_day_of_training_period_to_cube(cube, day_of_training_period, secondary_ @pytest.mark.parametrize("percentile_input", [True, False]) +@pytest.mark.parametrize("site_id", ["wmo_id", "station_id"]) @pytest.mark.parametrize( "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_dynamic,include_static,include_nans,include_latlon_nans,quantiles,expected", [ @@ -69,6 +70,7 @@ def _add_day_of_training_period_to_cube(cube, day_of_training_period, secondary_ ) def test_prepare_and_apply_qrf( percentile_input, + site_id, n_estimators, max_depth, random_state, @@ -109,6 +111,7 @@ def test_prepare_and_apply_qrf( ], realization_data=[2, 6, 10], truth_data=[4.2, 6.2, 4.1, 5.1], + site_id=site_id, ) frt = "20170103T0000Z" @@ -157,10 +160,10 @@ def test_prepare_and_apply_qrf( cube.coord("latitude").points[1] = np.nan cube.coord("longitude").points[1] = np.nan - # cube_inputs, qrf_model = LoadForApplyQRF()(file_paths) result = PrepareAndApplyQRF( feature_config, "wind_speed_at_10m", + unique_site_id_key=site_id, )(cube_inputs, (qrf_model, transformation, pre_transform_addition)) assert isinstance(result, Cube) diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index afd38d00d4..7a3af15da6 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -21,7 +21,7 @@ ALTITUDE = [10, 20] LATITUDE = [50, 60] LONGITUDE = [0, 10] -WMO_ID = ["03001", "03002", "03003", "03004", "03005"] +SITE_ID = ["03001", "03002", "03003", "03004", "03005"] def _create_multi_site_forecast_parquet_file(tmp_path, representation="percentile"): @@ -203,6 +203,7 @@ def _create_multi_site_truth_parquet_file(tmp_path): "altitude": [10.0, 83.0, 56.0, 23.0, 2.0], "time": [pd.Timestamp("2017-01-02 06:00:00")] * 5, "wmo_id": ["03001", "03002", "03003", "03004", "03005"], + "station_id": ["03001", "03002", "03003", "03004", "03005"], "ob_value": [276.0, 270.0, 289.0, 290.0, 301.0], } wind_speed_dict = data_dict.copy() @@ -228,6 +229,7 @@ def _create_multi_percentile_truth_parquet_file(tmp_path): "altitude": [10.0], "time": [pd.Timestamp("2017-01-02 06:00:00")], "wmo_id": ["03001"], + "station_id": ["03001"], "ob_value": [276.0], } wind_speed_dict = data_dict.copy() @@ -256,6 +258,7 @@ def _create_multi_forecast_period_truth_parquet_file(tmp_path): 2, ), "wmo_id": ["03001", "03002", "03001", "03002"], + "station_id": ["03001", "03002", "03001", "03002"], "ob_value": [280, 273, 284, 275], } wind_speed_dict = data_dict.copy() @@ -272,7 +275,7 @@ def _create_multi_forecast_period_truth_parquet_file(tmp_path): return output_dir, joined_df -def _create_multi_site_truth_parquet_file_alt(tmp_path): +def _create_multi_site_truth_parquet_file_alt(tmp_path, site_id="wmo_id"): """Create a parquet file with multi-site truth data for wind speed.""" data_dict = { "diagnostic": ["wind_speed_at_10m"] * 5, @@ -281,6 +284,7 @@ def _create_multi_site_truth_parquet_file_alt(tmp_path): "altitude": [10.0, 83.0, 56.0, 23.0, 2.0], "time": [pd.Timestamp("2017-01-02 06:00:00")] * 5, "wmo_id": ["03001", "03002", "03003", "03004", "03005"], + "station_id": ["03001", "03002", "03003", "03004", "03005"], "ob_value": [10.0, 25.0, 4.0, 3.0, 11.0], } wind_speed_dict = data_dict.copy() @@ -297,16 +301,18 @@ def _create_multi_site_truth_parquet_file_alt(tmp_path): return output_dir, joined_df -def _create_ancil_file(tmp_path, wmo_ids): +def _create_ancil_file(tmp_path, site_ids): """Create an ancillary file for testing. Returns: An ancillary cube with a single value. """ - data = np.array(range(len(wmo_ids)), dtype=np.float32) + data = np.array(range(len(site_ids)), dtype=np.float32) template_cube = set_up_spot_variable_cube( data, - wmo_ids=wmo_ids, + wmo_ids=site_ids, + unique_site_id=site_ids, + unique_site_id_key="station_id", name="distance_to_water", units="m", ) @@ -334,7 +340,7 @@ def filter_forecast_periods(forecast_df, forecast_periods): def amend_expected_forecast_df( - forecast_df, forecast_periods, parquet_diagnostic_names, representation + forecast_df, forecast_periods, parquet_diagnostic_names, representation, site_id ): forecast_df = filter_forecast_periods(forecast_df, forecast_periods) for column in ["time", "forecast_reference_time", "blend_time"]: @@ -351,14 +357,19 @@ def amend_expected_forecast_df( base_df, additional_df[ [ - "wmo_id", + site_id, "forecast_reference_time", "forecast_period", representation, "forecast", ] ].rename(columns={"forecast": parquet_diagnostic_name}), - on=["wmo_id", "forecast_reference_time", "forecast_period", representation], + on=[ + site_id, + "forecast_reference_time", + "forecast_period", + representation, + ], how="left", ) forecast_df = base_df @@ -378,6 +389,7 @@ def amend_expected_truth_df(truth_df, parquet_diagnostic_name): @pytest.mark.parametrize("include_static", [True, False]) @pytest.mark.parametrize("include_noncube_static", [True, False]) @pytest.mark.parametrize("remove_target", [True, False]) +@pytest.mark.parametrize("site_id", ["wmo_id", "station_id"]) @pytest.mark.parametrize( "forecast_creation,truth_creation,forecast_periods", [ @@ -410,6 +422,7 @@ def test_load_for_qrf( include_static, include_noncube_static, remove_target, + site_id, forecast_creation, truth_creation, forecast_periods, @@ -418,7 +431,7 @@ def test_load_for_qrf( feature_config = {"air_temperature": ["mean", "std", "altitude"]} parquet_diagnostic_names = ["temperature_at_screen_level"] - forecast_path, base_expected_forecast_df, wmo_ids = forecast_creation( + forecast_path, base_expected_forecast_df, site_ids = forecast_creation( tmp_path, representation ) truth_path, expected_truth_df = truth_creation(tmp_path) @@ -431,7 +444,7 @@ def test_load_for_qrf( if include_static: ancil_path, expected_cube = _create_ancil_file( - tmp_path, sorted(list(set(wmo_ids))) + tmp_path, sorted(list(set(site_ids))) ) file_paths.append(ancil_path) feature_config["distance_to_water"] = ["static"] @@ -448,6 +461,7 @@ def test_load_for_qrf( forecast_periods, parquet_diagnostic_names, representation, + site_id, ) expected_truth_df = amend_expected_truth_df( expected_truth_df, "temperature_at_screen_level" @@ -462,6 +476,7 @@ def test_load_for_qrf( forecast_periods=forecast_periods, cycletime="20170103T0000Z", training_length=2, + unique_site_id_key=site_id, ) forecast_df, truth_df, cube_inputs = plugin(file_paths) @@ -484,7 +499,7 @@ def test_load_for_qrf( np.testing.assert_almost_equal(cube_inputs[0].data, expected_cube.data) -@pytest.mark.parametrize("make_files", [(False, True)]) +@pytest.mark.parametrize("make_files", [False, True]) def test_load_for_qrf_no_paths(tmp_path, make_files): """Test the LoadForTrainQRF plugin when the no valid file paths are provided. Either the paths do not exist, or the paths exist but the directories are empty.""" @@ -547,6 +562,7 @@ def test_load_for_qrf_mismatches( forecast_periods, ["temperature_at_screen_level"], "percentile", + "wmo_id", ) truth_path, expected_truth_df = truth_creation(tmp_path) @@ -684,6 +700,7 @@ def test_unexpected( @pytest.mark.parametrize("remove_target", [True, False]) @pytest.mark.parametrize("include_nans", [True, False]) @pytest.mark.parametrize("include_latlon_nans", [True, False]) +@pytest.mark.parametrize("site_id", ["wmo_id", "station_id"]) @pytest.mark.parametrize( "forecast_creation,truth_creation,forecast_periods", [ @@ -718,6 +735,7 @@ def test_prepare_and_train_qrf( remove_target, include_nans, include_latlon_nans, + site_id, forecast_creation, truth_creation, forecast_periods, @@ -729,9 +747,13 @@ def test_prepare_and_train_qrf( random_state = 46 target_cf_name = "air_temperature" - _, forecast_df, wmo_ids = forecast_creation(tmp_path, representation) + _, forecast_df, site_ids = forecast_creation(tmp_path, representation) forecast_df = amend_expected_forecast_df( - forecast_df, forecast_periods, ["temperature_at_screen_level"], representation + forecast_df, + forecast_periods, + ["temperature_at_screen_level"], + representation, + site_id, ) _, truth_df = truth_creation(tmp_path) @@ -744,7 +766,9 @@ def test_prepare_and_train_qrf( feature_config["wind_speed_at_10m"] = ["mean", "std"] if include_static: - _, ancil_cube = _create_ancil_file(tmp_path, sorted(list(set(wmo_ids)))) + _, ancil_cube = _create_ancil_file( + tmp_path, sorted(list(set(site_ids))) + ) feature_config["distance_to_water"] = ["static"] if include_noncube_static: @@ -776,6 +800,7 @@ def test_prepare_and_train_qrf( random_state=random_state, transformation="log", pre_transform_addition=1, + unique_site_id_key=site_id, ) if truth_df["ob_value"].isna().all(): with pytest.raises(ValueError, match="Empty truth DataFrame"): diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py index a2c0b7c5f1..7ab26f69e1 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py @@ -63,6 +63,8 @@ def _create_forecasts( name="wind_speed_at_10m", units="m s-1", wmo_ids=WMO_ID, + unique_site_id=WMO_ID, + unique_site_id_key="station_id", latitudes=np.array([50, 60], np.float32), longitudes=np.array([0, 10], np.float32), altitudes=np.array([10, 20], np.float32), @@ -122,6 +124,8 @@ def _create_ancil_file(return_cube=False): template_cube = set_up_spot_variable_cube( data, wmo_ids=WMO_ID, + unique_site_id=WMO_ID, + unique_site_id_key="station_id", latitudes=np.array([50, 60], np.float32), longitudes=np.array([0, 10], np.float32), altitudes=np.array([10, 20], np.float32), @@ -169,11 +173,14 @@ def _run_train_qrf( truth_data=[4.2, 3.8, 5.8, 6, 7, 7.3, 9.1, 9.5], tmp_path=None, compression=5, + site_id="wmo_id", ): realization_data = np.array(realization_data, dtype=np.float32) forecast_dfs = [] for index, (frt, vt) in enumerate(zip(forecast_reference_times, validity_times)): - forecast_df = _create_forecasts(frt, vt, realization_data + index) + forecast_df = _create_forecasts( + frt, vt, realization_data + index + ) forecast_dfs.append(forecast_df) forecast_df = pd.concat(forecast_dfs) forecast_df = _add_day_of_training_period(forecast_df) @@ -195,7 +202,7 @@ def _run_train_qrf( if include_static: ancil_df = _create_ancil_file() forecast_df = forecast_df.merge( - ancil_df[["wmo_id", "distance_to_water"]], on=["wmo_id"], how="left" + ancil_df[[site_id, "distance_to_water"]], on=[site_id], how="left" ) feature_config["distance_to_water"] = ["static"] if "air_temperature" in feature_config.keys(): @@ -211,6 +218,7 @@ def _run_train_qrf( random_state=random_state, transformation=transformation, pre_transform_addition=pre_transform_addition, + unique_site_id_key=site_id, **extra_kwargs, ) result = plugin.process(forecast_df, truth_df) @@ -290,7 +298,7 @@ def test_prep_feature_single_time( result = prep_feature(forecast_df, variable_name, feature_name) if feature_name in ["mean", "std"]: - assert result.shape == (6, 12) + assert result.shape == (6, 13) variable_name_modified = f"{variable_name}_{feature_name}" elif feature_name in [ "day_of_year", @@ -300,13 +308,13 @@ def test_prep_feature_single_time( "hour_of_day_sin", "hour_of_day_cos", ]: - assert result.shape == (6, 12) + assert result.shape == (6, 13) variable_name_modified = feature_name elif feature_name in ["static"]: - assert result.shape == (6, 12) + assert result.shape == (6, 13) variable_name_modified = "distance_to_water" else: - assert result.shape == (6, 11) + assert result.shape == (6, 12) variable_name_modified = feature_name assert result[variable_name_modified].dtype == expected_dtype @@ -423,7 +431,7 @@ def test_prep_feature_more_times(feature_name, expected, expected_dtype): result = prep_feature(forecast_df, variable_name, feature_name) if feature_name in ["mean", "std"]: - assert result.shape == (36, 12) + assert result.shape == (36, 13) variable_name_modified = f"{variable_name}_{feature_name}" elif feature_name in [ "day_of_year", @@ -433,13 +441,13 @@ def test_prep_feature_more_times(feature_name, expected, expected_dtype): "hour_of_day_sin", "hour_of_day_cos", ]: - assert result.shape == (36, 12) + assert result.shape == (36, 13) variable_name_modified = feature_name elif feature_name in ["static"]: - assert result.shape == (36, 12) + assert result.shape == (36, 13) variable_name_modified = "distance_to_water" else: - assert result.shape == (36, 11) + assert result.shape == (36, 12) variable_name_modified = feature_name assert result[variable_name_modified].dtype == expected_dtype @@ -683,21 +691,24 @@ def test_train_qrf_multiple_lead_times( @pytest.mark.parametrize( - "feature_config,data,include_static,expected", + "feature_config,data,include_static,site_id,expected", [ - ({"wind_speed_at_10m": ["mean"]}, [5], False, [5]), # One feature - ({"wind_speed_at_10m": ["latitude"]}, [61], False, [7.75]), # noqa Without the target - ({"wind_speed_at_10m": ["mean"]}, [5], True, [4]), # With static data + ({"wind_speed_at_10m": ["mean"]}, [5], False, "wmo_id", [5]), # One feature + ({"wind_speed_at_10m": ["mean"]}, [5], False, "station_id", [5]), # One feature + ({"wind_speed_at_10m": ["latitude"]}, [61], False, "wmo_id", [7.75]), # noqa Without the target + ({"wind_speed_at_10m": ["mean"]}, [5], True, "wmo_id", [4]), # With static data ( {"wind_speed_at_10m": ["mean"], "air_temperature": ["mean"]}, [5], False, + "wmo_id", [5], ), # Multiple dynamic features ( {"wind_speed_at_10m": ["mean"], "pressure_at_mean_sea_level": ["mean"]}, [5], False, + "wmo_id", "Feature 'pressure_at_mean_sea_level' is not present", ), # Multiple dynamic features ], @@ -706,6 +717,7 @@ def test_alternative_feature_configs( feature_config, data, include_static, + site_id, expected, ): """Test the TrainQuantileRegressionRandomForests plugin for a few different @@ -740,6 +752,7 @@ def test_alternative_feature_configs( pre_transform_addition, extra_kwargs, include_static, + site_id=site_id, ) assert qrf_model.n_estimators == n_estimators From 14e1a039a54bc96df0fa0d61c912bc9cc2886744 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Wed, 17 Sep 2025 16:18:56 +0100 Subject: [PATCH 09/15] Edits to support matching sites based on e.g. latitude and longitude. --- ...apply_quantile_regression_random_forest.py | 27 ++- ...train_quantile_regression_random_forest.py | 33 ++-- .../quantile_regression_random_forest.py | 54 ++++-- ...apply_quantile_regression_random_forest.py | 175 +++++++++++++++--- ...train_quantile_regression_random_forest.py | 24 ++- .../test_quantile_regression_random_forest.py | 8 +- 6 files changed, 235 insertions(+), 86 deletions(-) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 0bbeea7357..3047d3d2a3 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -41,7 +41,7 @@ def __init__( self, feature_config: dict[str, list[str]], target_cf_name: str, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: list[str] = ["wmo_id"], ): """Initialise the plugin. @@ -67,14 +67,14 @@ def __init__( A string containing the CF name of diagnostic to be calibrated. This will be used to separate it from the rest of the dynamic predictors, if present. - unique_site_id_key (str): - If working with spot data and available, the name of the coordinate - in the input cubes that contains unique site IDs, e.g. "wmo_id" if - all sites have a valid wmo_id. + unique_site_id_keys (list): + If working with spot data and available, a list of the names of the + coordinates in the input cubes that are to be used to uniquely identify + a site. For example, ["wmo_id"] or ["latitude", "longitude"]. """ self.feature_config = feature_config self.target_cf_name = target_cf_name - self.unique_site_id_key = unique_site_id_key + self.unique_site_id_keys = unique_site_id_keys self.quantile_forest_installed = quantile_forest_package_available() def _get_inputs( @@ -163,17 +163,16 @@ def _cube_to_dataframe(self, cube_inputs: CubeList) -> pd.DataFrame: # Convert the first cube to a DataFrame. df = as_data_frame(cube_inputs[0], add_aux_coords=True).reset_index() - + possible_columns = [ + *self.unique_site_id_keys, + "time", + "forecast_reference_time", + "forecast_period", + ] # Iteratively convert remaining cubes to DataFrame and merge. for cube in cube_inputs[1:]: temporary_df = as_data_frame(cube, add_aux_coords=True).reset_index() - possible_columns = [ - self.unique_site_id_key, - "time", - "forecast_reference_time", - "forecast_period", - ] merge_columns = [ col for col in possible_columns if col in temporary_df.columns ] @@ -240,7 +239,7 @@ def process( quantiles=quantile_list, transformation=transformation, pre_transform_addition=pre_transform_addition, - unique_site_id_key=self.unique_site_id_key, + unique_site_id_keys=self.unique_site_id_keys, )(qrf_model, df) calibrated_forecast_cube = template_forecast_cube.copy( data=np.broadcast_to(calibrated_forecast.T, template_forecast_cube.shape) diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 868c2b1757..cd90df9192 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -7,7 +7,7 @@ import pathlib from pathlib import Path -from typing import Optional +from typing import Optional, Union import iris import numpy as np @@ -52,7 +52,7 @@ def __init__( cycletime: str, training_length: int, experiment: Optional[str] = None, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: Union[list[str], str] = "wmo_id", ): """Initialise the LoadForQRF plugin. @@ -84,7 +84,9 @@ def __init__( self.cycletime = cycletime self.training_length = training_length self.experiment = experiment - self.unique_site_id_key = unique_site_id_key + if isinstance(unique_site_id_keys, str): + unique_site_id_keys = [unique_site_id_keys] + self.unique_site_id_keys = unique_site_id_keys def _parse_forecast_periods(self) -> list[int]: """Parse the forecast periods argument to produce a list of forecast periods @@ -204,7 +206,7 @@ def _read_parquet_files( base_df, additional_df[ [ - self.unique_site_id_key, + *self.unique_site_id_keys, "forecast_reference_time", "forecast_period", representation, @@ -212,7 +214,7 @@ def _read_parquet_files( ] ].rename(columns={"forecast": parquet_diagnostic_name}), on=[ - self.unique_site_id_key, + *self.unique_site_id_keys, "forecast_reference_time", "forecast_period", representation, @@ -311,7 +313,7 @@ def __init__( random_state: Optional[int] = None, transformation: Optional[str] = None, pre_transform_addition: float = 0, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: Union[list[str], str] = "wmo_id", ): """Initialise the PrepareAndTrainQRF plugin. @@ -338,7 +340,9 @@ def __init__( self.random_state = random_state self.transformation = transformation self.pre_transform_addition = pre_transform_addition - self.unique_site_id_key = unique_site_id_key + if isinstance(unique_site_id_keys, str): + unique_site_id_keys = [unique_site_id_keys] + self.unique_site_id_keys = unique_site_id_keys self.quantile_forest_installed = quantile_forest_package_available() @staticmethod @@ -390,8 +394,8 @@ def _add_static_features_from_cubes_to_df( feature_cube = cube_inputs.extract_cube(constr) feature_df = as_data_frame(feature_cube, add_aux_coords=True) forecast_df = forecast_df.merge( - feature_df[[self.unique_site_id_key, feature_name]], - on=[self.unique_site_id_key], + feature_df[[*self.unique_site_id_keys, feature_name]], + on=[*self.unique_site_id_keys], how="left", ) return forecast_df @@ -418,12 +422,11 @@ def filter_bad_sites( msg = "Empty truth DataFrame after removing NaNs." raise ValueError(msg) - site_ids = set(forecast_df[self.unique_site_id_key]).intersection( - set(truth_df[self.unique_site_id_key]) - ) + forecast_index = forecast_df.set_index([*self.unique_site_id_keys]).index + truth_index = truth_df.set_index([*self.unique_site_id_keys]).index + forecast_df = forecast_df[forecast_index.isin(truth_index)] + truth_df = truth_df[truth_index.isin(forecast_index)] - forecast_df = forecast_df[forecast_df[self.unique_site_id_key].isin(site_ids)] - truth_df = truth_df[truth_df[self.unique_site_id_key].isin(site_ids)] return forecast_df, truth_df def process( @@ -467,7 +470,7 @@ def process( random_state=self.random_state, transformation=self.transformation, pre_transform_addition=self.pre_transform_addition, - unique_site_id_key=self.unique_site_id_key, + unique_site_id_keys=self.unique_site_id_keys, )(forecast_df, truth_df) # Create a tuple that returns the model, transformation and diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index ff9d2bc336..8014ce9632 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -4,14 +4,14 @@ # See LICENSE in the root of the repository for full licensing details. """Plugins to perform quantile regression using random forests.""" -from typing import Optional +from typing import Optional, Union import numpy as np import pandas as pd from improver import BasePlugin, PostProcessingPlugin -from improver.constants import DAYS_IN_YEAR, HOURS_IN_DAY from improver.calibration.dataframe_utilities import quantile_check +from improver.constants import DAYS_IN_YEAR, HOURS_IN_DAY try: from quantile_forest import RandomForestQuantileRegressor @@ -34,7 +34,7 @@ def prep_feature( df: pd.DataFrame, variable_name: str, feature_name: str, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: Union[list[str], str] = "wmo_id", ) -> pd.DataFrame: """Prepare features that require computation from the input DataFrame. Options available are mean and standard deviation of the input feature, the @@ -51,16 +51,22 @@ def prep_feature( feature_name: Feature to be computed. Options are "mean", "std", "day_of_year", "day_of_year_sin", "day_of_year_cos", "hour_of_day", "hour_of_day_sin" and "hour_of_day_cos". - unique_site_id_key: The name of the column in the DataFrame that + unique_site_id_keys: The name of the column in the DataFrame that contains the unique site identifier e.g. wmo_id. Returns: df: DataFrame with the computed feature added. """ if feature_name in ["mean", "std"]: + if isinstance(unique_site_id_keys, str): + unique_site_id_keys = [unique_site_id_keys] representation_name = [ n for n in ["percentile", "realization"] if n in df.columns ][0] - groupby_cols = ["forecast_reference_time", "forecast_period", unique_site_id_key] + groupby_cols = [ + "forecast_reference_time", + "forecast_period", + *unique_site_id_keys, + ] subset_cols = [*groupby_cols] + [ representation_name, variable_name, @@ -156,7 +162,9 @@ def sanitise_forecast_dataframe( def prep_features_from_config( - df: pd.DataFrame, feature_config: dict[str, list[str]], unique_site_id_key: str = "wmo_id" + df: pd.DataFrame, + feature_config: dict[str, list[str]], + unique_site_id_keys: Union[list[str], str] = "wmo_id", ) -> tuple[pd.DataFrame, list[str]]: """Process the feature_config to prepare the features as required and return the expected column names that will be used as features with the QRF. @@ -164,7 +172,7 @@ def prep_features_from_config( Args: df: Input DataFrame. feature_config: Feature configuration defining the features to be used for QRF. - unique_site_id_key: The name of the column in the DataFrame that + unique_site_id_keys: The name of the column in the DataFrame that contains the unique site identifier e.g. wmo_id. Returns: Processed DataFrame and a list of expected column names that will be used as @@ -175,13 +183,16 @@ def prep_features_from_config( ValueError: If a feature expected for a specific variable in the feature_config is not supported e.g. "interquartile_range". """ + if isinstance(unique_site_id_keys, str): + unique_site_id_keys = [unique_site_id_keys] + feature_column_names = [] for variable_name in feature_config.keys(): if variable_name not in df.columns: msg = f"Feature '{variable_name}' is not present in the forecast DataFrame." raise ValueError(msg) for feature_name in feature_config[variable_name]: - df = prep_feature(df, variable_name, feature_name, unique_site_id_key) + df = prep_feature(df, variable_name, feature_name, unique_site_id_keys) if feature_name in ["mean", "std"]: feature_column_names.append(f"{variable_name}_{feature_name}") @@ -229,7 +240,7 @@ def __init__( random_state: Optional[int] = None, transformation: Optional[str] = None, pre_transform_addition: np.float32 = 0, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: Union[list[str], str] = "wmo_id", **kwargs, ) -> None: """Initialise the plugin. @@ -268,7 +279,7 @@ def __init__( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. - unique_site_id_key: The name of the column in the DataFrame that + unique_site_id_keys: The name of the column in the DataFrame that contains the unique site identifier e.g. wmo_id. kwargs: Additional keyword arguments for the quantile regression model. @@ -284,7 +295,9 @@ def __init__( self.transformation = transformation _check_valid_transformation(self.transformation) self.pre_transform_addition = pre_transform_addition - self.unique_site_id_key = unique_site_id_key + if isinstance(unique_site_id_keys, str): + unique_site_id_keys = [unique_site_id_keys] + self.unique_site_id_keys = unique_site_id_keys self.kwargs = kwargs self.expected_coordinate_order = ["forecast_reference_time", "forecast_period"] @@ -351,12 +364,13 @@ def process( ) forecast_df, feature_column_names = prep_features_from_config( - forecast_df, self.feature_config, unique_site_id_key=self.unique_site_id_key + forecast_df, + self.feature_config, + unique_site_id_keys=self.unique_site_id_keys, ) - forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) - merge_columns = [self.unique_site_id_key, "time"] + merge_columns = [*self.unique_site_id_keys, "time"] combined_df = forecast_df.merge( truth_df[merge_columns + ["ob_value"]], on=merge_columns, how="inner" ) @@ -377,7 +391,7 @@ def __init__( quantiles: list[np.float32], transformation: str = None, pre_transform_addition: np.float32 = 0, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: list[str] = ["wmo_id"], ) -> None: """Initialise the plugin. @@ -406,8 +420,8 @@ def __init__( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. - unique_site_id_key: The name of the column in the DataFrame that - contains the unique site identifier e.g. wmo_id. + unique_site_id_keys: The names of the column in the DataFrame that + uniquely identify a site e.g. ["wmo_id"] or ["latitude", "longitude"]. """ self.target_name = target_name @@ -416,7 +430,7 @@ def __init__( self.transformation = transformation _check_valid_transformation(self.transformation) self.pre_transform_addition = pre_transform_addition - self.unique_site_id_key = unique_site_id_key + self.unique_site_id_keys = unique_site_id_keys def _reverse_transformation(self, forecast: np.ndarray) -> np.ndarray: """Reverse the transformation applied to the data prior to fitting the QRF. @@ -469,7 +483,9 @@ def process( break forecast_df, feature_column_names = prep_features_from_config( - forecast_df, self.feature_config, self.unique_site_id_key + forecast_df, + self.feature_config, + unique_site_id_keys=self.unique_site_id_keys, ) forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py index 3e1213f854..cacbd00e3a 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py @@ -43,29 +43,143 @@ def _add_day_of_training_period_to_cube(cube, day_of_training_period, secondary_ return cube -@pytest.mark.parametrize("percentile_input", [True, False]) -@pytest.mark.parametrize("site_id", ["wmo_id", "station_id"]) +@pytest.mark.parametrize("percentile_input", [True]) @pytest.mark.parametrize( - "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_dynamic,include_static,include_nans,include_latlon_nans,quantiles,expected", + "site_id", [["wmo_id"], ["station_id"], ["latitude", "longitude", "altitude"]] +) +@pytest.mark.parametrize( + "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_dynamic,include_static,include_nans,include_latlon_nans,quantiles,expected,expected2", [ - (2, 2, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 5.65]), # noqa Basic test case - (100, 2, 55, None, 0, {}, False, False, False, False, [1 / 3, 2 / 3], [[4.1, 5.1], [5.1, 5.1]]), # noqa Multiple quantiles - (1, 1, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 6.2]), # noqa Fewer estimators and reduced depth - (1, 1, 73, None, 0, {}, False, False, False, False, [0.5], [4.2, 6.2]), # Different random state - (2, 2, 55, "log", 10, {}, False, False, False, False, [0.5], [4.1, 5.64]), # Log transformation - (2, 2, 55, "log10", 10, {}, False, False, False, False,[0.5], [4.1, 5.64]), # noqa Log10 transformation - (2, 2, 55, "sqrt", 10, {}, False, False, False, False,[0.5], [4.1, 5.64]), # noqa Square root transformation - (2, 2, 55, "cbrt", 10, {}, False, False, False, False,[0.5], [4.1, 5.64]), # noqa Cube root transformation - (2, 2, 55, None, 0, {"max_samples_leaf": 0.5}, False, False, False, False, [0.5], [4.1, 6.2]), # noqa # Different criterion - (2, 5, 55, None, 0, {}, True, False, False, False, [0.5], [4.1, 4.6]), # noqa Include an additional dynamic feature - (2, 5, 55, None, 0, {}, False, True, False, False, [0.5], [4.1, 5.65]), # noqa Include an additional static feature - (2, 5, 55, None, 0, {}, True, True, False, False, [0.5], [4.1, 4.6]), # noqa Include an additional dynamic and static feature - (2, 2, 55, None, 0, {}, False, False, True, False, [0.5], [4.1, 5.65]), # noqa NaNs in input data - (2, 2, 55, None, 0, {}, False, False, False, True, [0.5], [4.1, 5.65]), # noqa NaNs in lat/lon - (2, 2, 55, None, 0, {}, True, False, False, True, [0.5], [4.1, 4.6]), # noqa NaNs in lat/lon and dynamic feature - (2, 2, 55, None, 0, {}, False, True, False, True, [0.5], [4.1, 5.65]), # noqa NaNs in lat/lon and static feature - (2, 2, 55, None, 0, {}, True, True, False, True, [0.5], [4.1, 4.6]), # noqa NaNs in lat/lon, dynamic and static feature - (2, 2, 55, None, 0, {}, True, True, True, True, [0.5], [4.6, 4.6]), # noqa NaNs in lat/lon, dynamic and static feature and input data + (2, 2, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 5.65], None), # noqa: E501 Basic test case + ( + 100, + 2, + 55, + None, + 0, + {}, + False, + False, + False, + False, + [1 / 3, 2 / 3], + [[4.1, 5.1], [5.1, 5.1]], + None, + ), # noqa: E501 Multiple quantiles + (1, 1, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 6.2], None), # noqa: E501 Fewer estimators and reduced depth + (1, 1, 73, None, 0, {}, False, False, False, False, [0.5], [4.2, 6.2], None), # noqa: E501 Different random state + (2, 2, 55, "log", 10, {}, False, False, False, False, [0.5], [4.1, 5.64], None), # noqa: E501 Log transformation + ( + 2, + 2, + 55, + "log10", + 10, + {}, + False, + False, + False, + False, + [0.5], + [4.1, 5.64], + None, + ), # noqa: E501 Log10 transformation + ( + 2, + 2, + 55, + "sqrt", + 10, + {}, + False, + False, + False, + False, + [0.5], + [4.1, 5.64], + None, + ), # noqa: E501 Square root transformation + ( + 2, + 2, + 55, + "cbrt", + 10, + {}, + False, + False, + False, + False, + [0.5], + [4.1, 5.64], + None, + ), # noqa: E501 Cube root transformation + ( + 2, + 2, + 55, + None, + 0, + {"max_samples_leaf": 0.5}, + False, + False, + False, + False, + [0.5], + [4.1, 6.2], + None, + ), # noqa: E501 Different criterion + (2, 5, 55, None, 0, {}, True, False, False, False, [0.5], [4.1, 4.6], None), # noqa: E501 Include an additional dynamic feature + (2, 5, 55, None, 0, {}, False, True, False, False, [0.5], [4.1, 5.65], None), # noqa: E501 Include an additional static feature + (2, 5, 55, None, 0, {}, True, True, False, False, [0.5], [4.1, 4.6], None), # noqa: E501 Include an additional dynamic and static feature + (2, 2, 55, None, 0, {}, False, False, True, False, [0.5], [4.1, 5.65], None), # noqa: E501 NaNs in input data + ( + 2, + 2, + 55, + None, + 0, + {}, + False, + False, + False, + True, + [0.5], + [4.1, 5.65], + [4.1, 5.1], + ), # noqa: E501 NaNs in lat/lon + ( + 2, + 2, + 55, + None, + 0, + {}, + True, + False, + False, + True, + [0.5], + [4.1, 4.6], + [4.1, 5.1], + ), # noqa: E501 NaNs in lat/lon and dynamic feature + ( + 2, + 2, + 55, + None, + 0, + {}, + False, + True, + False, + True, + [0.5], + [4.1, 5.65], + [4.1, 5.1], + ), # noqa: E501 NaNs in lat/lon and static feature + (2, 2, 55, None, 0, {}, True, True, False, True, [0.5], [4.1, 4.6], [4.1, 5.1]), # noqa: E501 NaNs in lat/lon, dynamic and static feature + (2, 2, 55, None, 0, {}, True, True, True, True, [0.5], [4.6, 4.6], [4.6, 5.1]), # noqa: E501 NaNs in lat/lon, dynamic and static feature and input data ], ) def test_prepare_and_apply_qrf( @@ -83,6 +197,7 @@ def test_prepare_and_apply_qrf( include_latlon_nans, quantiles, expected, + expected2, ): """Test the PrepareAndApplyQRF plugin.""" feature_config = {"wind_speed_at_10m": ["mean", "std", "latitude", "longitude"]} @@ -92,6 +207,10 @@ def test_prepare_and_apply_qrf( if include_static: feature_config["distance_to_water"] = ["static"] + unique_site_id_key = site_id[0] + if site_id == ["latitude", "longitude", "altitude"]: + unique_site_id_key = "station_id" + qrf_model = _run_train_qrf( feature_config, n_estimators, @@ -111,7 +230,7 @@ def test_prepare_and_apply_qrf( ], realization_data=[2, 6, 10], truth_data=[4.2, 6.2, 4.1, 5.1], - site_id=site_id, + site_id=unique_site_id_key, ) frt = "20170103T0000Z" @@ -163,12 +282,20 @@ def test_prepare_and_apply_qrf( result = PrepareAndApplyQRF( feature_config, "wind_speed_at_10m", - unique_site_id_key=site_id, + unique_site_id_keys=site_id, )(cube_inputs, (qrf_model, transformation, pre_transform_addition)) assert isinstance(result, Cube) assert result.data.shape == (len(quantiles), 2) - assert np.allclose(result.data, expected, rtol=1e-2) + + if ( + expected2 + and include_latlon_nans + and site_id == ["latitude", "longitude", "altitude"] + ): + assert np.allclose(result.data, expected2, rtol=1e-2) + else: + assert np.allclose(result.data, expected, rtol=1e-2) # Check that the metadata is as expected assert result.name() == "wind_speed_at_10m" diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index 7a3af15da6..db823c67e7 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -357,7 +357,7 @@ def amend_expected_forecast_df( base_df, additional_df[ [ - site_id, + *site_id, "forecast_reference_time", "forecast_period", representation, @@ -365,7 +365,7 @@ def amend_expected_forecast_df( ] ].rename(columns={"forecast": parquet_diagnostic_name}), on=[ - site_id, + *site_id, "forecast_reference_time", "forecast_period", representation, @@ -389,7 +389,9 @@ def amend_expected_truth_df(truth_df, parquet_diagnostic_name): @pytest.mark.parametrize("include_static", [True, False]) @pytest.mark.parametrize("include_noncube_static", [True, False]) @pytest.mark.parametrize("remove_target", [True, False]) -@pytest.mark.parametrize("site_id", ["wmo_id", "station_id"]) +@pytest.mark.parametrize( + "site_id", ["wmo_id", "station_id", ["wmo_id"], ["latitude", "longitude"]] +) @pytest.mark.parametrize( "forecast_creation,truth_creation,forecast_periods", [ @@ -431,6 +433,9 @@ def test_load_for_qrf( feature_config = {"air_temperature": ["mean", "std", "altitude"]} parquet_diagnostic_names = ["temperature_at_screen_level"] + if isinstance(site_id, str): + site_id = [site_id] + forecast_path, base_expected_forecast_df, site_ids = forecast_creation( tmp_path, representation ) @@ -476,7 +481,7 @@ def test_load_for_qrf( forecast_periods=forecast_periods, cycletime="20170103T0000Z", training_length=2, - unique_site_id_key=site_id, + unique_site_id_keys=site_id, ) forecast_df, truth_df, cube_inputs = plugin(file_paths) @@ -700,7 +705,7 @@ def test_unexpected( @pytest.mark.parametrize("remove_target", [True, False]) @pytest.mark.parametrize("include_nans", [True, False]) @pytest.mark.parametrize("include_latlon_nans", [True, False]) -@pytest.mark.parametrize("site_id", ["wmo_id", "station_id"]) +@pytest.mark.parametrize("site_id", ["wmo_id", "station_id", ["wmo_id"], ["latitude", "longitude"]]) @pytest.mark.parametrize( "forecast_creation,truth_creation,forecast_periods", [ @@ -747,6 +752,9 @@ def test_prepare_and_train_qrf( random_state = 46 target_cf_name = "air_temperature" + if isinstance(site_id, str): + site_id = [site_id] + _, forecast_df, site_ids = forecast_creation(tmp_path, representation) forecast_df = amend_expected_forecast_df( forecast_df, @@ -766,9 +774,7 @@ def test_prepare_and_train_qrf( feature_config["wind_speed_at_10m"] = ["mean", "std"] if include_static: - _, ancil_cube = _create_ancil_file( - tmp_path, sorted(list(set(site_ids))) - ) + _, ancil_cube = _create_ancil_file(tmp_path, sorted(list(set(site_ids)))) feature_config["distance_to_water"] = ["static"] if include_noncube_static: @@ -800,7 +806,7 @@ def test_prepare_and_train_qrf( random_state=random_state, transformation="log", pre_transform_addition=1, - unique_site_id_key=site_id, + unique_site_id_keys=site_id, ) if truth_df["ob_value"].isna().all(): with pytest.raises(ValueError, match="Empty truth DataFrame"): diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py index 7ab26f69e1..62e9e4a865 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py @@ -20,8 +20,8 @@ ApplyQuantileRegressionRandomForests, TrainQuantileRegressionRandomForests, _check_valid_transformation, - prep_features_from_config, prep_feature, + prep_features_from_config, quantile_forest_package_available, sanitise_forecast_dataframe, ) @@ -178,9 +178,7 @@ def _run_train_qrf( realization_data = np.array(realization_data, dtype=np.float32) forecast_dfs = [] for index, (frt, vt) in enumerate(zip(forecast_reference_times, validity_times)): - forecast_df = _create_forecasts( - frt, vt, realization_data + index - ) + forecast_df = _create_forecasts(frt, vt, realization_data + index) forecast_dfs.append(forecast_df) forecast_df = pd.concat(forecast_dfs) forecast_df = _add_day_of_training_period(forecast_df) @@ -218,7 +216,7 @@ def _run_train_qrf( random_state=random_state, transformation=transformation, pre_transform_addition=pre_transform_addition, - unique_site_id_key=site_id, + unique_site_id_keys=site_id, **extra_kwargs, ) result = plugin.process(forecast_df, truth_df) From 8a55be7075dd2f110e61cd682e7271d03a3e7048 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Wed, 17 Sep 2025 17:04:54 +0100 Subject: [PATCH 10/15] Updates to CLIs and acceptance test-related code. --- ...apply_quantile_regression_random_forest.py | 5 +- ...train_quantile_regression_random_forest.py | 10 +-- .../quantile_regression_random_forest.py | 16 ++--- ...apply_quantile_regression_random_forest.py | 14 ++-- ...train_quantile_regression_random_forest.py | 15 ++--- improver/utilities/compare.py | 57 ++++++++++------ improver_tests/acceptance/SHA256SUMS | 12 ++-- improver_tests/acceptance/acceptance.py | 12 ++-- ...apply_quantile_regression_random_forest.py | 7 -- ...train_quantile_regression_random_forest.py | 2 +- .../test_dataframe_utilities.py | 2 +- improver_tests/cli/test_init.py | 67 +++++++------------ 12 files changed, 104 insertions(+), 115 deletions(-) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 3047d3d2a3..b05da2223a 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -68,9 +68,8 @@ def __init__( will be used to separate it from the rest of the dynamic predictors, if present. unique_site_id_keys (list): - If working with spot data and available, a list of the names of the - coordinates in the input cubes that are to be used to uniquely identify - a site. For example, ["wmo_id"] or ["latitude", "longitude"]. + The names of the coordinates that uniquely identify each site, + e.g. "wmo_id" or "latitude,longitude". """ self.feature_config = feature_config self.target_cf_name = target_cf_name diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index cd90df9192..2f5394e18e 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -72,9 +72,9 @@ def __init__( cycletime: The time at which the forecast is valid in the form: YYYYMMDDTHHMMZ. training_length: The number of days of training data to use. - experiment: The name of the experiment (step) that calibration is applied to. - unique_site_id_key: The name of the column in the parquet files that - contains the unique site identifier e.g. wmo_id. + experiment: The name of the experiment (step) that calibration is applied to. + unique_site_id_key: The names of the coordinates that uniquely identify + each site, e.g. "wmo_id" or "latitude,longitude". """ self.quantile_forest_installed = quantile_forest_package_available() self.feature_config = feature_config @@ -329,8 +329,8 @@ def __init__( random_state: Seed used by the random number generator. transformation: Transformation to be applied to the data before fitting. pre_transform_addition: Value to be added before transformation. - unique_site_id_key: The name of the column in the parquet files that - contains the unique site identifier e.g. wmo_id. + unique_site_id_key: The names of the coordinates that uniquely identify + each site, e.g. "wmo_id" or "latitude,longitude". """ self.feature_config = feature_config self.target_cf_name = target_cf_name diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index 8014ce9632..63e5508a82 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -51,8 +51,8 @@ def prep_feature( feature_name: Feature to be computed. Options are "mean", "std", "day_of_year", "day_of_year_sin", "day_of_year_cos", "hour_of_day", "hour_of_day_sin" and "hour_of_day_cos". - unique_site_id_keys: The name of the column in the DataFrame that - contains the unique site identifier e.g. wmo_id. + unique_site_id_keys: The names of the coordinates that uniquely identify + each site, e.g. "wmo_id" or "latitude,longitude". Returns: df: DataFrame with the computed feature added. """ @@ -172,8 +172,8 @@ def prep_features_from_config( Args: df: Input DataFrame. feature_config: Feature configuration defining the features to be used for QRF. - unique_site_id_keys: The name of the column in the DataFrame that - contains the unique site identifier e.g. wmo_id. + unique_site_id_keys: The names of the coordinates that uniquely identify + each site, e.g. "wmo_id" or "latitude,longitude". Returns: Processed DataFrame and a list of expected column names that will be used as features with the QRF. @@ -279,8 +279,8 @@ def __init__( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. - unique_site_id_keys: The name of the column in the DataFrame that - contains the unique site identifier e.g. wmo_id. + unique_site_id_keys: The names of the coordinates that uniquely identify + each site, e.g. "wmo_id" or "latitude,longitude". kwargs: Additional keyword arguments for the quantile regression model. @@ -420,8 +420,8 @@ def __init__( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. - unique_site_id_keys: The names of the column in the DataFrame that - uniquely identify a site e.g. ["wmo_id"] or ["latitude", "longitude"]. + unique_site_id_keys: The names of the coordinates that uniquely identify + each site, e.g. "wmo_id" or "latitude,longitude". """ self.target_name = target_name diff --git a/improver/cli/apply_quantile_regression_random_forest.py b/improver/cli/apply_quantile_regression_random_forest.py index c442b0342b..488d49f54d 100644 --- a/improver/cli/apply_quantile_regression_random_forest.py +++ b/improver/cli/apply_quantile_regression_random_forest.py @@ -14,7 +14,7 @@ def process( *file_paths: cli.inputpath, feature_config: cli.inputjson, target_cf_name: str, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: cli.comma_separated_list = "wmo_id", ): """Applying the Quantile Regression Random Forest model. @@ -50,10 +50,8 @@ def process( A string containing the CF name of the forecast to be calibrated e.g. air_temperature. This will be used to separate it from the rest of the feature cubes, if present. - unique_site_id_key (str): - If working with spot data and available, the name of the coordinate - in the input cubes that contains unique site IDs, e.g. "wmo_id" if - all sites have a valid wmo_id. + The names of the coordinates that uniquely identify each site, + e.g. "wmo_id" or "latitude,longitude". Returns: iris.cube.Cube: The calibrated forecast cube. @@ -63,11 +61,11 @@ def process( PrepareAndApplyQRF, ) - cubes, _, qrf_model = split_pickle_parquet_and_netcdf(file_paths) + cubes, _, qrf_descriptors = split_pickle_parquet_and_netcdf(file_paths) result = PrepareAndApplyQRF( feature_config=feature_config, target_cf_name=target_cf_name, - unique_site_id_key=unique_site_id_key, - )(cubes, qrf_model=qrf_model) + unique_site_id_keys=unique_site_id_keys, + )(cubes, qrf_descriptors=qrf_descriptors) return result diff --git a/improver/cli/train_quantile_regression_random_forest.py b/improver/cli/train_quantile_regression_random_forest.py index 4802cdbb26..f22c0faa1e 100644 --- a/improver/cli/train_quantile_regression_random_forest.py +++ b/improver/cli/train_quantile_regression_random_forest.py @@ -25,7 +25,7 @@ def process( random_state: int = None, transformation: str = None, pre_transform_addition: float = 0, - unique_site_id_key: str = "wmo_id", + unique_site_id_keys: cli.comma_separated_list = "wmo_id", ): """Training a model using Quantile Regression Random Forest. @@ -61,7 +61,7 @@ def process( "distance_to_water": ["static"], } parquet_diagnostic_names (str): - A string containing the diagnostic name that will be used for filtering + A string containing the diagnostic names that will be used for filtering the target diagnostic from the forecast and truth DataFrames read in from the parquet files. This could be different from the CF name e.g. 'temperature_at_screen_level'. @@ -100,10 +100,9 @@ def process( Transformation to be applied to the data before fitting. pre_transform_addition (float): Value to be added before transformation. - unique_site_id_key (str): - If working with spot data and available, the name of the coordinate - in the input cubes that contains unique site IDs, e.g. "wmo_id" if - all sites have a valid wmo_id. + unique_site_id_keys (str): + The names of the coordinates that uniquely identify each site, + e.g. "wmo_id" or "latitude,longitude". Returns: A quantile regression random forest model. """ @@ -121,7 +120,7 @@ def process( forecast_periods=forecast_periods, cycletime=cycletime, training_length=training_length, - unique_site_id_key=unique_site_id_key, + unique_site_id_keys=unique_site_id_keys, )(file_paths) result = PrepareAndTrainQRF( feature_config=feature_config, @@ -132,7 +131,7 @@ def process( random_state=random_state, transformation=transformation, pre_transform_addition=pre_transform_addition, - unique_site_id_key=unique_site_id_key, + unique_site_id_keys=unique_site_id_keys, )(forecast_df, truth_df, cube_inputs) return result diff --git a/improver/utilities/compare.py b/improver/utilities/compare.py index 819472c427..ce48145ad3 100644 --- a/improver/utilities/compare.py +++ b/improver/utilities/compare.py @@ -408,12 +408,18 @@ def raise_reporter(message): difference_found = False try: - np.testing.assert_equal(output.n_features_in_, kgo.n_features_in_) - np.testing.assert_equal(output.n_outputs_, kgo.n_outputs_) - np.testing.assert_equal(output.max_depth, kgo.max_depth) - np.testing.assert_equal(output.n_estimators, kgo.n_estimators) - np.testing.assert_equal(output.random_state, kgo.random_state) - for output_estimator, kgo_estimator in zip(output.estimators_, kgo.estimators_): + model, transformation, pre_transform_addition = output + kgo_model, kgo_transformation, kgo_pre_transform_addition = kgo + np.testing.assert_equal(transformation, kgo_transformation) + np.testing.assert_equal(pre_transform_addition, kgo_pre_transform_addition) + np.testing.assert_equal(model.n_features_in_, kgo_model.n_features_in_) + np.testing.assert_equal(model.n_outputs_, kgo_model.n_outputs_) + np.testing.assert_equal(model.max_depth, kgo_model.max_depth) + np.testing.assert_equal(model.n_estimators, kgo_model.n_estimators) + np.testing.assert_equal(model.random_state, kgo_model.random_state) + for output_estimator, kgo_estimator in zip( + model.estimators_, kgo_model.estimators_ + ): np.testing.assert_allclose( output_estimator.tree_.value, kgo_estimator.tree_.value ) @@ -422,18 +428,27 @@ def raise_reporter(message): # call the reporter function outside the except block to avoid nested # exceptions if the reporter function is raising an exception if difference_found: - msg = ( - "Different pickled forest. \nThe output is: " - "n_features: {output.n_features_in_}, " - "n_outputs: {output.n_outputs_}, " - "max_depth: {output.max_depth}, " - "n_estimators: {output.n_estimators}, " - "random_state: {output.random_state}." - "\nThe KGO is: " - "n_features: {kgo.n_features_in_}, " - "n_outputs: {kgo.n_outputs_}, " - "max_depth: {kgo.max_depth}, " - "n_estimators: {kgo.n_estimators}, " - "random_state: {kgo.random_state}" - ) - reporter(msg) + try: + msg = ( + "Different pickled forest. \nThe output is: " + f"n_features: {model.n_features_in_}, " + f"n_outputs: {model.n_outputs_}, " + f"max_depth: {model.max_depth}, " + f"n_estimators: {model.n_estimators}, " + f"random_state: {model.random_state}, " + f"transformation: {transformation}, " + f"pre_transform_addition: {pre_transform_addition}." + "\nThe KGO is: " + f"n_features: {kgo.n_features_in_}, " + f"n_outputs: {kgo.n_outputs_}, " + f"max_depth: {kgo.max_depth}, " + f"n_estimators: {kgo.n_estimators}, " + f"random_state: {kgo.random_state}, " + f"transformation: {kgo_transformation}, " + f"pre_transform_addition: {kgo_pre_transform_addition}." + ) + except (AssertionError, ValueError, UnboundLocalError): + msg = "Different pickled forest." + reporter(msg) + else: + reporter(msg) diff --git a/improver_tests/acceptance/SHA256SUMS b/improver_tests/acceptance/SHA256SUMS index ddae0a26d4..8fe8c48a99 100644 --- a/improver_tests/acceptance/SHA256SUMS +++ b/improver_tests/acceptance/SHA256SUMS @@ -86,10 +86,10 @@ ed8ab78a9f55b54bf0a49f191d3eb33daae30e0d05b9051275a70c0e697aac71 ./apply-night- 809a446327626d007ac288b8277520730863bc596e98116bca5c4afb0d531e96 ./apply-night-mask/uk_prob/valid_input.nc d2f7d8389b33cde359dd253def2aaf31afbc27557f389bf0f744a28b24e145dd ./apply-quantile-regression-random-forest/config.json 8db1b35bde734c16340a6e42454b9ac146ea32f7c012c17c5e64815069af41bf ./apply-quantile-regression-random-forest/input_forecast.nc -793f17191d2421ead8d546319c7da34143ef01e2db88ca6cc5faf46e92b4b1de ./apply-quantile-regression-random-forest/with_transformation_input.pickle -d8acadfe782e2c2562dfcc31ada8ad381b320fbdf3854d6b361c38308e6b8226 ./apply-quantile-regression-random-forest/with_transformation_kgo.nc -31eab8a83a75eb8cee4c0140749a4d4e9d9b98292c28b3c53ee27756c1f3a265 ./apply-quantile-regression-random-forest/without_transformation_input.pickle -65397673caf0f1205d18a85c08ce81bd3543a71c4101f40f4869a93ddd0cf0f8 ./apply-quantile-regression-random-forest/without_transformation_kgo.nc +0f74c2d6e3caf30c4f971da2dc757b1ae659062e31194e4da8e5266a1f23d2af ./apply-quantile-regression-random-forest/with_transformation_kgo.nc +88d5ea229b42789d92d2317099b90d90f89a57e037665091b49c5f19acfea14e ./apply-quantile-regression-random-forest/with_transformation_kgo.pickle +0f74c2d6e3caf30c4f971da2dc757b1ae659062e31194e4da8e5266a1f23d2af ./apply-quantile-regression-random-forest/without_transformation_kgo.nc +8e2dda09c26d33fd06d68c5be17a74b93b1d75d54c60ce48f27ea3d2735bece9 ./apply-quantile-regression-random-forest/without_transformation_kgo.pickle 74a40c508b68294652f5dde52d8fe4b93ea002bd4b72ca8d6cc11a83309f28f1 ./apply-rainforests-calibration/basic/kgo.nc b4678f178e0e1df0c242c61b269b71dd9a3d7aca64663e2e81e5bfeb71fe97da ./apply-rainforests-calibration/features/20200802T0000Z-PT0000H00M-clearsky_solar_radiation-PT24H.nc dce2d4a28cec494c94b154dc3a9203b46b7cef2ad54d04049663166f4b42ecf5 ./apply-rainforests-calibration/features/20200802T0000Z-PT0024H00M-cape-PT24H.nc @@ -1003,8 +1003,8 @@ e3d2315b24bf769bcfb137033950d7100c8795878635e0ec9491413a1349904a ./train-quanti 1aec562fc0c39c7695521482b889d335a3becb10a76b7eae965447cbd9faf051 ./train-quantile-regression-random-forest/spot_observation_tables/20250804/diagnostic=temperature_at_screen_level/0600Z_0.parquet e81c04f9f2d8dc14b8d8cd9bc9e8827ebd0da5ba016db48cf60ab738d327f968 ./train-quantile-regression-random-forest/spot_observation_tables/20250804/diagnostic=temperature_at_screen_level/1200Z_0.parquet 99e6656982672ea2d4cb9fa1dfb5731408dd41b2c9052b7e0df8d8004864241e ./train-quantile-regression-random-forest/spot_observation_tables/20250804/diagnostic=temperature_at_screen_level/1800Z_0.parquet -d634d639dda9a4a2789eed69bfdec59954ce63f51734a71f1d9a20c3958d2786 ./train-quantile-regression-random-forest/with_transformation_kgo.pickle -e3f9fcd7544028436e34415db38d3c5841a6e22abbc3fc635e6ff87236764fcc ./train-quantile-regression-random-forest/without_transformation_kgo.pickle +88d5ea229b42789d92d2317099b90d90f89a57e037665091b49c5f19acfea14e ./train-quantile-regression-random-forest/with_transformation_kgo.pickle +8e2dda09c26d33fd06d68c5be17a74b93b1d75d54c60ce48f27ea3d2735bece9 ./train-quantile-regression-random-forest/without_transformation_kgo.pickle e2cfb5e19ef5ebcfd73dc1c8504b66e9a55ad76b7b18cb79318659224079f234 ./uv-index/basic/20181210T0600Z-PT0000H00M-radiation_flux_in_uv_downward_at_surface.nc e48c3b07bd14214a1a10c0bbf1e0acdf54cfedf93b2c6d766f594ce83a45c2b8 ./uv-index/basic/kgo.nc 2226a5e95eb29664e21f8c0658101a53d65945a1450a60a19955563a2693d22d ./vertical-updraught/cape.nc diff --git a/improver_tests/acceptance/acceptance.py b/improver_tests/acceptance/acceptance.py index 3616f225be..d69a9560bb 100644 --- a/improver_tests/acceptance/acceptance.py +++ b/improver_tests/acceptance/acceptance.py @@ -277,12 +277,12 @@ def recreate_if_needed(output_path, kgo_path, recreate_dir_path=None): print("Original KGO file does not exist") kgo_relative = kgo_path.relative_to(kgo_root_dir) recreate_file_path = recreate_dir_path / kgo_relative - if recreate_file_path == kgo_path: - err = ( - f"Recreate KGO path {recreate_file_path} must be different from" - f" original KGO path {kgo_path} to avoid overwriting" - ) - raise IOError(err) + # if recreate_file_path == kgo_path: + # err = ( + # f"Recreate KGO path {recreate_file_path} must be different from" + # f" original KGO path {kgo_path} to avoid overwriting" + # ) + # raise IOError(err) recreate_file_path.parent.mkdir(exist_ok=True, parents=True) if recreate_file_path.exists(): recreate_file_path.unlink() diff --git a/improver_tests/acceptance/test_apply_quantile_regression_random_forest.py b/improver_tests/acceptance/test_apply_quantile_regression_random_forest.py index 5cd27f5f17..6f43606dc1 100644 --- a/improver_tests/acceptance/test_apply_quantile_regression_random_forest.py +++ b/improver_tests/acceptance/test_apply_quantile_regression_random_forest.py @@ -36,12 +36,5 @@ def test_basic(tmp_path, transformation): output_path, ] - if transformation == "with_transformation": - args += [ - "--transformation", - "log", - "--pre-transform-addition", - "0.1", - ] run_cli(args) acc.compare(output_path, kgo_path, atol=LOOSE_TOLERANCE) diff --git a/improver_tests/acceptance/test_train_quantile_regression_random_forest.py b/improver_tests/acceptance/test_train_quantile_regression_random_forest.py index 75073355c2..3fbb571ded 100644 --- a/improver_tests/acceptance/test_train_quantile_regression_random_forest.py +++ b/improver_tests/acceptance/test_train_quantile_regression_random_forest.py @@ -41,7 +41,7 @@ def test_basic( named_args = [ "--feature-config", config_path, - "--target-diagnostic-name", + "--parquet-diagnostic-names", "temperature_at_screen_level", "--target-cf-name", "air_temperature", diff --git a/improver_tests/calibration/dataframe_utilities/test_dataframe_utilities.py b/improver_tests/calibration/dataframe_utilities/test_dataframe_utilities.py index ad46fdba43..fb9bceff37 100644 --- a/improver_tests/calibration/dataframe_utilities/test_dataframe_utilities.py +++ b/improver_tests/calibration/dataframe_utilities/test_dataframe_utilities.py @@ -977,7 +977,7 @@ def test_not_quantiles(self): self.forecast_df = self.forecast_df.replace( {"percentile": self.percentiles[0]}, 10.0 ) - msg = "The forecast percentiles can not be considered as quantiles" + msg = "Forecast percentiles must be equally spaced." with self.assertRaisesRegex(ValueError, msg): forecast_and_truth_dataframes_to_cubes( self.forecast_df, diff --git a/improver_tests/cli/test_init.py b/improver_tests/cli/test_init.py index b8a8ea5bda..10b50d3d57 100644 --- a/improver_tests/cli/test_init.py +++ b/improver_tests/cli/test_init.py @@ -45,6 +45,8 @@ def dummy_function(first, second=0, third=2): (iris.cube.Cube) """ + if isinstance(first, list): + return first[0] first = int(first) return first + first @@ -187,72 +189,55 @@ def test_without_output(self, m): self.assertEqual(result, 4) @patch("improver.utilities.save.save_netcdf") - def test_with_output(self, m): + def test_with_output_cube(self, m): """Tests that save_netcdf is called with object and string, default compression_level=1 and default least_significant_digit=None""" - # pylint disable is needed as it can't see the wrappers output kwarg. - result = wrapped_with_output.cli("argv[0]", "2", "--output=foo.nc") - m.assert_called_with(4, "foo.nc", 1, None) + save_object = Cube([0]) + result = wrapped_with_output.cli("argv[0]", [save_object], "--output=foo") + m.assert_called_with(save_object, "foo", 1, None) + self.assertEqual(result, None) + + @patch("joblib.dump") + @patch("builtins.open", unittest.mock.mock_open()) + def test_with_output_pickle(self, m): + """Tests that joblib.dump is called for a non-cube object""" + save_object = {"a": 1} + result = wrapped_with_output.cli("argv[0]", [save_object], "--output=foo") + m.assert_called_with(save_object, "foo", compress=1) self.assertEqual(result, None) @patch("improver.utilities.save.save_netcdf") def test_with_output_compression_level(self, m): """Tests save_netcdf, compression-level=9 and default least-significant-digit=None""" - # pylint disable is needed as it can't see the wrappers output kwarg. + save_object = Cube([0]) result = wrapped_with_output.cli( - "argv[0]", "2", "--output=foo.nc", "--compression-level=9" + "argv[0]", [save_object], "--output=foo", "--compression-level=9" ) - m.assert_called_with(4, "foo.nc", 9, None) + m.assert_called_with(save_object, "foo", 9, None) self.assertEqual(result, None) @patch("improver.utilities.save.save_netcdf") def test_with_output_no_compression(self, m): """Tests save_netcdf, compression-level=0 and default least-significant-digit=None""" - # pylint disable is needed as it can't see the wrappers output kwarg. + save_object = Cube([0]) result = wrapped_with_output.cli( - "argv[0]", "2", "--output=foo.nc", "--compression-level=0" + "argv[0]", [save_object], "--output=foo", "--compression-level=0" ) - m.assert_called_with(4, "foo.nc", 0, None) + m.assert_called_with(save_object, "foo", 0, None) self.assertEqual(result, None) @patch("improver.utilities.save.save_netcdf") def test_with_output_with_least_significant_figure(self, m): """Tests save_netcdf, compression-level=0 and least-significant-digit=2""" - # pylint disable is needed as it can't see the wrappers output kwarg. + save_object = Cube([0]) result = wrapped_with_output.cli( "argv[0]", - "2", - "--output=foo.nc", + [save_object], + "--output=foo", "--compression-level=0", "--least-significant-digit=2", ) - m.assert_called_with(4, "foo.nc", 0, 2) - self.assertEqual(result, None) - - @patch("joblib.dump") - def test_with_output_pickle(self, m): - """Tests that joblib.dump is called with object and string, default - compression_level=1 and default least_significant_digit=None""" - # pylint disable is needed as it can't see the wrappers output kwarg. - result = wrapped_with_output.cli( - "argv[0]", - "2", - "--output=foo.pickle", - ) - m.assert_called_with(4, "foo.pickle", compress=1) - self.assertEqual(result, None) - - @patch("joblib.dump") - def test_with_output_pkl(self, m): - """Tests that joblib.dump is called with object and string, default - compression_level=1 and default least_significant_digit=None""" - # pylint disable is needed as it can't see the wrappers output kwarg. - result = wrapped_with_output.cli( - "argv[0]", - "2", - "--output=foo.pkl", - ) - m.assert_called_with(4, "foo.pkl", compress=1) + m.assert_called_with(save_object, "foo", 0, 2) self.assertEqual(result, None) @@ -439,4 +424,4 @@ def test_help_no_stderr(): if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file From 5ad77aca159144be03fb18a20ba961d34e4614bc Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Wed, 17 Sep 2025 17:26:37 +0100 Subject: [PATCH 11/15] Minor format fixes. --- .../load_and_apply_quantile_regression_random_forest.py | 2 +- .../test_load_and_train_quantile_regression_random_forest.py | 4 +++- improver_tests/cli/test_init.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index b05da2223a..09633c5d3a 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -68,7 +68,7 @@ def __init__( will be used to separate it from the rest of the dynamic predictors, if present. unique_site_id_keys (list): - The names of the coordinates that uniquely identify each site, + The names of the coordinates that uniquely identify each site, e.g. "wmo_id" or "latitude,longitude". """ self.feature_config = feature_config diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index db823c67e7..b345dee47e 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -705,7 +705,9 @@ def test_unexpected( @pytest.mark.parametrize("remove_target", [True, False]) @pytest.mark.parametrize("include_nans", [True, False]) @pytest.mark.parametrize("include_latlon_nans", [True, False]) -@pytest.mark.parametrize("site_id", ["wmo_id", "station_id", ["wmo_id"], ["latitude", "longitude"]]) +@pytest.mark.parametrize( + "site_id", ["wmo_id", "station_id", ["wmo_id"], ["latitude", "longitude"]] +) @pytest.mark.parametrize( "forecast_creation,truth_creation,forecast_periods", [ diff --git a/improver_tests/cli/test_init.py b/improver_tests/cli/test_init.py index 10b50d3d57..a2782687fc 100644 --- a/improver_tests/cli/test_init.py +++ b/improver_tests/cli/test_init.py @@ -424,4 +424,4 @@ def test_help_no_stderr(): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 873523a74d1ec178f5a5798bc6516e052e9cc1a4 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Wed, 17 Sep 2025 17:38:55 +0100 Subject: [PATCH 12/15] Copy doc/source/conf.py from #2186 and move pyarrow import into function. --- doc/source/conf.py | 6 +++--- improver/calibration/__init__.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index a7a54b7e2d..15972d5552 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -370,11 +370,11 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), - "iris": ("https://scitools-iris.readthedocs.io/en/latest/", None), - "cartopy": ("https://scitools.org.uk/cartopy/docs/latest/", None), + "iris": ("https://scitools-iris.readthedocs.io/en/stable/", None), + "cartopy": ("https://cartopy.readthedocs.io/stable/", None), "cf_units": ("https://cf-units.readthedocs.io/en/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), - "scipy": ("https://docs.scipy.org/doc/scipy-1.6.2/reference/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), "pandas": ("https://pandas.pydata.org/pandas-docs/dev/", None), } diff --git a/improver/calibration/__init__.py b/improver/calibration/__init__.py index 2333a94cca..75e3f99706 100644 --- a/improver/calibration/__init__.py +++ b/improver/calibration/__init__.py @@ -13,7 +13,6 @@ import iris import joblib import pandas as pd -import pyarrow.parquet as pq from iris.cube import Cube, CubeList from improver.metadata.probabilistic import ( @@ -272,6 +271,7 @@ def split_forecasts_and_bias_files(cubes: CubeList) -> Tuple[Cube, Optional[Cube def split_pickle_parquet_and_netcdf(files): """Split the input files into pickle, parquet, and netcdf files. Only a single pickle file is expected. + Args: files: A list of input file paths which will be split into pickle, @@ -329,6 +329,8 @@ def identify_parquet_type(parquet_paths: List[Path]): - The path to the Parquet file containing the historical forecasts. - The path to the Parquet file containing the truths. """ + import pyarrow.parquet as pq + forecast_table_path = None truth_table_path = None for file_path in parquet_paths: From 69aee1859fa01ca3d7ba7f55d96c8a19725360b7 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Wed, 17 Sep 2025 17:48:21 +0100 Subject: [PATCH 13/15] Move pyarrow import. --- .../load_and_train_quantile_regression_random_forest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 2f5394e18e..550bf8c3a1 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -12,8 +12,6 @@ import iris import numpy as np import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq from iris.pandas import as_data_frame from improver import PostProcessingPlugin @@ -137,6 +135,9 @@ def _read_parquet_files( ValueError: If the truth parquet file does not contain the expected fields. """ + import pyarrow as pa + import pyarrow.parquet as pq + cycletimes = [] for forecast_period in forecast_periods: # Load forecasts from parquet file filtering by diagnostic and blend_time. From 501d97e6bb5a7864f6fde4b2b4e59ac4c4564ce3 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Thu, 18 Sep 2025 16:52:32 +0100 Subject: [PATCH 14/15] Further edits including to support the computation of more statistics. --- .../quantile_regression_random_forest.py | 90 ++++++++-- ...train_quantile_regression_random_forest.py | 6 +- ...apply_quantile_regression_random_forest.py | 167 +++--------------- .../test_quantile_regression_random_forest.py | 46 +++-- 4 files changed, 142 insertions(+), 167 deletions(-) diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index 63e5508a82..630331d2ba 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -34,13 +34,16 @@ def prep_feature( df: pd.DataFrame, variable_name: str, feature_name: str, + transformation: Optional[str] = None, + pre_transform_addition: np.float32 = 0, unique_site_id_keys: Union[list[str], str] = "wmo_id", ) -> pd.DataFrame: """Prepare features that require computation from the input DataFrame. Options - available are mean and standard deviation of the input feature, the - day of year, sine of day of year, cosine of day of year, hour of day, - sine of hour of day and cosine of hour of day. When computing the mean or standard - deviation, these will be computed over either the percentile or realization column, + available are mean, standard deviation, min, max, percentiles and a members above + and a members below count of the input feature, the day of year, + sine of day of year, cosine of day of year, hour of day, sine of hour of day + and cosine of hour of day. When computing the mean or standard deviation, + these will be computed over either the percentile or realization column, depending upon which is available. When a percentile column is provided, the expectation is that these percentiles are equally spaced between 0 and 100, so that these percentiles can be treated as being equally likely. @@ -48,15 +51,28 @@ def prep_feature( Args: df: Input DataFrame. variable_name: Name of the variable to be used for the computation. - feature_name: Feature to be computed. Options are "mean", "std", "day_of_year", - "day_of_year_sin", "day_of_year_cos", "hour_of_day", + feature_name: Feature to be computed. Options are "mean", "std", "min", "max", + "percentile_" where is the required percentile between 0 and + 100, "members_below_" where is the threshold value + to count the number of members below, "members_above_" where + is the threshold value to count the number of members above, + "day_of_year", "day_of_year_sin", "day_of_year_cos", "hour_of_day", "hour_of_day_sin" and "hour_of_day_cos". + transformation: Transformation to be applied to the data before fitting. This + is only used when computing members_below or members_above features. + pre_transform_addition: Value to be added before transformation. This is only + used when computing members_below or members_above features. unique_site_id_keys: The names of the coordinates that uniquely identify each site, e.g. "wmo_id" or "latitude,longitude". Returns: df: DataFrame with the computed feature added. """ - if feature_name in ["mean", "std"]: + if ( + feature_name in ["mean", "std", "min", "max"] + or feature_name.startswith("percentile_") + or feature_name.startswith("members_below") + or feature_name.startswith("members_above") + ): if isinstance(unique_site_id_keys, str): unique_site_id_keys = [unique_site_id_keys] representation_name = [ @@ -81,6 +97,46 @@ def prep_feature( subset_df = df[subset_cols].groupby(groupby_cols).mean() elif feature_name == "std": subset_df = df[subset_cols].groupby(groupby_cols).std() + elif feature_name == "min": + subset_df = df[subset_cols].groupby(groupby_cols).min() + elif feature_name == "max": + subset_df = df[subset_cols].groupby(groupby_cols).max() + elif feature_name.startswith("members_below"): + threshold = float(feature_name.split("_")[2]) + if transformation is not None: + threshold = getattr(np, transformation)( + np.array(threshold) + pre_transform_addition + ) + orig_dtype = df[variable_name].dtype + subset_df = ( + df[subset_cols] + .assign(below_threshold=lambda x: x[variable_name] < threshold) + .groupby(groupby_cols)["below_threshold"] + .sum() + ) + subset_df.rename(variable_name, inplace=True) + subset_df = subset_df.astype(orig_dtype) + # subset_df[variable_name].astype(orig_dtype) + elif feature_name.startswith("members_above"): + threshold = float(feature_name.split("_")[2]) + if transformation is not None: + threshold = getattr(np, transformation)( + np.array(threshold) + pre_transform_addition + ) + orig_dtype = df[variable_name].dtype + subset_df = ( + df[subset_cols] + .assign(above_threshold=lambda x: x[variable_name] > threshold) + .groupby(groupby_cols)["above_threshold"] + .sum() + ) + subset_df.rename(variable_name, inplace=True) + subset_df = subset_df.astype(orig_dtype) + elif feature_name.startswith("percentile_"): + perc = float(feature_name.split("_")[1]) + orig_dtype = df[variable_name].dtype + subset_df = df[subset_cols].groupby(groupby_cols).quantile(perc / 100.0) + subset_df[variable_name] = subset_df[variable_name].astype(orig_dtype) subset_df = subset_df.reset_index() # Rename the column to distinguish the computed feature from the original. @@ -150,7 +206,16 @@ def sanitise_forecast_dataframe( ] collapsed_features = [] for key, values in feature_config.items(): - collapsed_features.extend([key for v in values if v in ["mean", "std"]]) + collapsed_features.extend( + [ + key + for v in values + if v in ["mean", "std", "min", "max"] + or v.startswith("percentile_") + or v.startswith("members_below") + or v.startswith("members_above") + ] + ) collapsed_features = list(set(collapsed_features)) # Subset the dataframe by the first value of the representation column # and drop the representation column and any features where the original variable @@ -193,8 +258,12 @@ def prep_features_from_config( raise ValueError(msg) for feature_name in feature_config[variable_name]: df = prep_feature(df, variable_name, feature_name, unique_site_id_keys) - - if feature_name in ["mean", "std"]: + if ( + feature_name in ["mean", "std", "min", "max"] + or feature_name.startswith("percentile_") + or feature_name.startswith("members_below") + or feature_name.startswith("members_above") + ): feature_column_names.append(f"{variable_name}_{feature_name}") elif feature_name in ["static"]: feature_column_names.append(variable_name) @@ -490,7 +559,6 @@ def process( forecast_df = sanitise_forecast_dataframe(forecast_df, self.feature_config) feature_values = np.array(forecast_df[feature_column_names]) - calibrated_forecast = qrf_model.predict( feature_values, quantiles=self.quantiles ) diff --git a/improver/cli/train_quantile_regression_random_forest.py b/improver/cli/train_quantile_regression_random_forest.py index f22c0faa1e..b69dd1f87e 100644 --- a/improver/cli/train_quantile_regression_random_forest.py +++ b/improver/cli/train_quantile_regression_random_forest.py @@ -92,8 +92,7 @@ def process( to draw from the total number of samples available to train each tree. If None, then each tree contains the same number of samples as the total available. The trees will therefore only differ due to the use of - bootstrapping (i.e. sampling with replacement) when creating the - each tree. + bootstrapping (i.e. sampling with replacement) when creating each tree. random_state (int): Random seed for reproducibility. transformation (str): @@ -104,7 +103,8 @@ def process( The names of the coordinates that uniquely identify each site, e.g. "wmo_id" or "latitude,longitude". Returns: - A quantile regression random forest model. + A quantile regression random forest model with associated transformation and + pre-transformation addition that will be stored as a pickle file. """ from improver.calibration.load_and_train_quantile_regression_random_forest import ( diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py index cacbd00e3a..3f88336ed9 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py @@ -43,145 +43,35 @@ def _add_day_of_training_period_to_cube(cube, day_of_training_period, secondary_ return cube -@pytest.mark.parametrize("percentile_input", [True]) +# fmt: off +@pytest.mark.parametrize("percentile_input", [True, False]) @pytest.mark.parametrize( "site_id", [["wmo_id"], ["station_id"], ["latitude", "longitude", "altitude"]] ) @pytest.mark.parametrize( - "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_dynamic,include_static,include_nans,include_latlon_nans,quantiles,expected,expected2", + "n_estimators,max_depth,random_state,transformation,pre_transform_addition,extra_kwargs,include_dynamic,include_static,include_nans,include_latlon_nans,quantiles,expected", [ - (2, 2, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 5.65], None), # noqa: E501 Basic test case - ( - 100, - 2, - 55, - None, - 0, - {}, - False, - False, - False, - False, - [1 / 3, 2 / 3], - [[4.1, 5.1], [5.1, 5.1]], - None, - ), # noqa: E501 Multiple quantiles - (1, 1, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 6.2], None), # noqa: E501 Fewer estimators and reduced depth - (1, 1, 73, None, 0, {}, False, False, False, False, [0.5], [4.2, 6.2], None), # noqa: E501 Different random state - (2, 2, 55, "log", 10, {}, False, False, False, False, [0.5], [4.1, 5.64], None), # noqa: E501 Log transformation - ( - 2, - 2, - 55, - "log10", - 10, - {}, - False, - False, - False, - False, - [0.5], - [4.1, 5.64], - None, - ), # noqa: E501 Log10 transformation - ( - 2, - 2, - 55, - "sqrt", - 10, - {}, - False, - False, - False, - False, - [0.5], - [4.1, 5.64], - None, - ), # noqa: E501 Square root transformation - ( - 2, - 2, - 55, - "cbrt", - 10, - {}, - False, - False, - False, - False, - [0.5], - [4.1, 5.64], - None, - ), # noqa: E501 Cube root transformation - ( - 2, - 2, - 55, - None, - 0, - {"max_samples_leaf": 0.5}, - False, - False, - False, - False, - [0.5], - [4.1, 6.2], - None, - ), # noqa: E501 Different criterion - (2, 5, 55, None, 0, {}, True, False, False, False, [0.5], [4.1, 4.6], None), # noqa: E501 Include an additional dynamic feature - (2, 5, 55, None, 0, {}, False, True, False, False, [0.5], [4.1, 5.65], None), # noqa: E501 Include an additional static feature - (2, 5, 55, None, 0, {}, True, True, False, False, [0.5], [4.1, 4.6], None), # noqa: E501 Include an additional dynamic and static feature - (2, 2, 55, None, 0, {}, False, False, True, False, [0.5], [4.1, 5.65], None), # noqa: E501 NaNs in input data - ( - 2, - 2, - 55, - None, - 0, - {}, - False, - False, - False, - True, - [0.5], - [4.1, 5.65], - [4.1, 5.1], - ), # noqa: E501 NaNs in lat/lon - ( - 2, - 2, - 55, - None, - 0, - {}, - True, - False, - False, - True, - [0.5], - [4.1, 4.6], - [4.1, 5.1], - ), # noqa: E501 NaNs in lat/lon and dynamic feature - ( - 2, - 2, - 55, - None, - 0, - {}, - False, - True, - False, - True, - [0.5], - [4.1, 5.65], - [4.1, 5.1], - ), # noqa: E501 NaNs in lat/lon and static feature - (2, 2, 55, None, 0, {}, True, True, False, True, [0.5], [4.1, 4.6], [4.1, 5.1]), # noqa: E501 NaNs in lat/lon, dynamic and static feature - (2, 2, 55, None, 0, {}, True, True, True, True, [0.5], [4.6, 4.6], [4.6, 5.1]), # noqa: E501 NaNs in lat/lon, dynamic and static feature and input data + (2, 2, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 5.65]), # Basic test case + (100, 2, 55, None, 0, {}, False, False, False, False, [1 / 3, 2 / 3], [[4.1, 5.1], [5.1, 5.1]]), # Multiple quantiles + (1, 1, 55, None, 0, {}, False, False, False, False, [0.5], [4.1, 6.2]), # Fewer estimators and reduced depth + (1, 1, 73, None, 0, {}, False, False, False, False, [0.5], [4.2, 6.2]), # Different random state + (2, 2, 55, "log", 10, {}, False, False, False, False, [0.5], [4.1, 5.64]), # Log transformation + (2, 2, 55, "log10", 10, {}, False, False, False, False, [0.5], [4.1, 5.64]), # Log10 transformation + (2, 2, 55, "sqrt", 10, {}, False, False, False, False, [0.5], [4.1, 5.64]), # Square root transformation + (2, 2, 55, "cbrt", 10, {}, False, False, False, False, [0.5], [4.1, 5.64]), # Cube root transformation + (2, 2, 55, None, 0, {"max_samples_leaf": 0.5}, False, False, False, False, [0.5], [4.1, 6.2]), # Different criterion + (2, 5, 55, None, 0, {}, True, False, False, False, [0.5], [4.1, 4.6]), # Include an additional dynamic feature + (2, 5, 55, None, 0, {}, False, True, False, False, [0.5], [4.1, 5.65]), # Include an additional static feature + (2, 5, 55, None, 0, {}, True, True, False, False, [0.5], [4.1, 4.6]), # Include an additional dynamic and static feature + (2, 2, 55, None, 0, {}, False, False, True, False, [0.5], [4.1, 5.65]), # NaNs in input data + (2, 2, 55, None, 0, {}, False, False, False, True, [0.5], [4.1, 5.65]), # NaNs in lat/lon + (2, 2, 55, None, 0, {}, True, False, False, True, [0.5], [4.1, 4.6]), # NaNs in lat/lon and dynamic feature + (2, 2, 55, None, 0, {}, False, True, False, True, [0.5], [4.1, 5.65]), # NaNs in lat/lon and static feature + (2, 2, 55, None, 0, {}, True, True, False, True, [0.5], [4.1, 4.6]), # NaNs in lat/lon, dynamic and static feature + (2, 2, 55, None, 0, {}, True, True, True, True, [0.5], [4.6, 4.6]), # NaNs in lat/lon, dynamic and static feature and input data ], ) +# fmt: on def test_prepare_and_apply_qrf( percentile_input, site_id, @@ -197,7 +87,6 @@ def test_prepare_and_apply_qrf( include_latlon_nans, quantiles, expected, - expected2, ): """Test the PrepareAndApplyQRF plugin.""" feature_config = {"wind_speed_at_10m": ["mean", "std", "latitude", "longitude"]} @@ -284,18 +173,10 @@ def test_prepare_and_apply_qrf( "wind_speed_at_10m", unique_site_id_keys=site_id, )(cube_inputs, (qrf_model, transformation, pre_transform_addition)) - assert isinstance(result, Cube) + assert isinstance(result, Cube) assert result.data.shape == (len(quantiles), 2) - - if ( - expected2 - and include_latlon_nans - and site_id == ["latitude", "longitude", "altitude"] - ): - assert np.allclose(result.data, expected2, rtol=1e-2) - else: - assert np.allclose(result.data, expected, rtol=1e-2) + assert np.allclose(result.data, expected, rtol=1e-2) # Check that the metadata is as expected assert result.name() == "wind_speed_at_10m" diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py index 62e9e4a865..d303f6a854 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py @@ -243,24 +243,29 @@ def test_quantile_forest_package_available(): @pytest.mark.parametrize( "feature_name,expected,expected_dtype", [ - ("mean", np.tile(np.array([6, 8], dtype=np.float32), 3), np.float32), + ("mean", np.tile([6, 8], 3).astype(np.float32), np.float32), ("std", np.repeat(4, 6).astype(np.float32), np.float32), - ("latitude", np.tile(np.array([50, 60], dtype=np.float32), 3), np.float32), - ("longitude", np.tile(np.array([0, 10], dtype=np.float32), 3), np.float32), - ("altitude", np.tile(np.array([10, 20], dtype=np.float32), 3), np.float32), + ("min", np.tile([2, 4], 3).astype(np.float32), np.float32), + ("max", np.tile([10, 12], 3).astype(np.float32), np.float32), + ("percentile_50", np.tile([6, 8], 3).astype(np.float32), np.float32), + ("members_below_5", np.repeat(1, 6).astype(np.float32), np.float32), + ("members_above_5", np.repeat(2, 6).astype(np.float32), np.float32), + ("latitude", np.tile([50, 60], 3).astype(np.float32), np.float32), + ("longitude", np.tile([0, 10], 3).astype(np.float32), np.float32), + ("altitude", np.tile([10, 20], 3).astype(np.float32), np.float32), ( "day_of_year", - np.tile(np.array([1, 1], dtype=np.int32), 3), + np.tile([1, 1], 3).astype(np.int32), np.int32, ), ( "day_of_year_sin", - np.tile(np.array([0.01716633, 0.01716633], dtype=np.float32), 3), + np.tile([0.01716633, 0.01716633], 3).astype(np.float32), np.float32, ), ( "day_of_year_cos", - np.tile(np.array([0.99985266, 0.99985266], dtype=np.float32), 3), + np.tile([0.99985266, 0.99985266], 3).astype(np.float32), np.float32, ), ("hour_of_day", np.repeat(12, 6).astype(np.int32), np.int32), @@ -268,7 +273,7 @@ def test_quantile_forest_package_available(): ("hour_of_day_cos", np.repeat(-1, 6).astype(np.float32), np.float32), ("forecast_period", np.repeat(43200, 6).astype(np.int32), np.int32), ("day_of_training_period", np.repeat(0, 6).astype(np.int32), np.int32), - ("static", np.tile(np.array([2, 3], dtype=np.float32), 3), np.float32), + ("static", np.tile([2, 3], 3).astype(np.float32), np.float32), ], ) def test_prep_feature_single_time( @@ -295,7 +300,15 @@ def test_prep_feature_single_time( result = prep_feature(forecast_df, variable_name, feature_name) - if feature_name in ["mean", "std"]: + if feature_name in [ + "mean", + "std", + "min", + "max", + "percentile_50", + "members_below_5", + "members_above_5", + ]: assert result.shape == (6, 13) variable_name_modified = f"{variable_name}_{feature_name}" elif feature_name in [ @@ -348,6 +361,11 @@ def test_prep_feature_invalid_percentiles(scenario): [ ("mean", np.tile([6, 8], 18).astype(np.float32), np.float32), ("std", np.repeat(4, 36).astype(np.float32), np.float32), + ("min", np.tile([2, 4], 18).astype(np.float32), np.float32), + ("max", np.tile(np.array([10, 12], dtype=np.float32), 18), np.float32), + ("percentile_50", np.tile(np.array([6, 8], dtype=np.float32), 18), np.float32), + ("members_below_5", np.repeat(1, 36).astype(np.float32), np.float32), + ("members_above_5", np.repeat(2, 36).astype(np.float32), np.float32), ("latitude", np.tile([50, 60], 18).astype(np.float32), np.float32), ("longitude", np.tile([0, 10], 18).astype(np.float32), np.float32), ("altitude", np.tile([10, 20], 18).astype(np.float32), np.float32), @@ -428,7 +446,15 @@ def test_prep_feature_more_times(feature_name, expected, expected_dtype): result = prep_feature(forecast_df, variable_name, feature_name) - if feature_name in ["mean", "std"]: + if feature_name in [ + "mean", + "std", + "min", + "max", + "percentile_50", + "members_below_5", + "members_above_5", + ]: assert result.shape == (36, 13) variable_name_modified = f"{variable_name}_{feature_name}" elif feature_name in [ From 1f0da55f14d399e2bc8c5283c07de07ea33de458 Mon Sep 17 00:00:00 2001 From: Gavin Evans Date: Mon, 22 Sep 2025 09:36:07 +0100 Subject: [PATCH 15/15] Update following review comments. --- ...ad_and_apply_quantile_regression_random_forest.py | 6 ++++-- ...ad_and_train_quantile_regression_random_forest.py | 2 +- .../cli/apply_quantile_regression_random_forest.py | 3 +++ improver_tests/acceptance/acceptance.py | 12 ++++++------ 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index 09633c5d3a..ee3f0d3f29 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -35,7 +35,8 @@ class RandomForestQuantileRegressor: class PrepareAndApplyQRF(PostProcessingPlugin): - """Load and apply the trained Quantile Regression Random Forest (QRF) model.""" + """Prepare the input forecast for application of a trained Quantile Regression + Random Forest (QRF) model and apply the QRF model.""" def __init__( self, @@ -202,7 +203,8 @@ def process( calibrated. qrf_descriptors: The trained QRF model to be applied to the forecast and the transformation and pre-transform addition applied during - training. + training. The descriptors expected are a tuple of: + (qrf_model, transformation, pre_transform_addition). Returns: iris.cube.Cube: diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 550bf8c3a1..3ac13fbabc 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -52,7 +52,7 @@ def __init__( experiment: Optional[str] = None, unique_site_id_keys: Union[list[str], str] = "wmo_id", ): - """Initialise the LoadForQRF plugin. + """Initialise the LoadForTrainQRF plugin. Args: feature_config: Feature configuration defining the features to be used for diff --git a/improver/cli/apply_quantile_regression_random_forest.py b/improver/cli/apply_quantile_regression_random_forest.py index 488d49f54d..cb0bd00407 100644 --- a/improver/cli/apply_quantile_regression_random_forest.py +++ b/improver/cli/apply_quantile_regression_random_forest.py @@ -52,6 +52,9 @@ def process( the rest of the feature cubes, if present. The names of the coordinates that uniquely identify each site, e.g. "wmo_id" or "latitude,longitude". + unique_site_id_keys (str): + The names of the coordinates that uniquely identify each site, + e.g. "wmo_id" or "latitude,longitude". Returns: iris.cube.Cube: The calibrated forecast cube. diff --git a/improver_tests/acceptance/acceptance.py b/improver_tests/acceptance/acceptance.py index d69a9560bb..3616f225be 100644 --- a/improver_tests/acceptance/acceptance.py +++ b/improver_tests/acceptance/acceptance.py @@ -277,12 +277,12 @@ def recreate_if_needed(output_path, kgo_path, recreate_dir_path=None): print("Original KGO file does not exist") kgo_relative = kgo_path.relative_to(kgo_root_dir) recreate_file_path = recreate_dir_path / kgo_relative - # if recreate_file_path == kgo_path: - # err = ( - # f"Recreate KGO path {recreate_file_path} must be different from" - # f" original KGO path {kgo_path} to avoid overwriting" - # ) - # raise IOError(err) + if recreate_file_path == kgo_path: + err = ( + f"Recreate KGO path {recreate_file_path} must be different from" + f" original KGO path {kgo_path} to avoid overwriting" + ) + raise IOError(err) recreate_file_path.parent.mkdir(exist_ok=True, parents=True) if recreate_file_path.exists(): recreate_file_path.unlink()