diff --git a/improver/calibration/__init__.py b/improver/calibration/__init__.py index 75e3f99706..054a2d4b2b 100644 --- a/improver/calibration/__init__.py +++ b/improver/calibration/__init__.py @@ -268,24 +268,27 @@ def split_forecasts_and_bias_files(cubes: CubeList) -> Tuple[Cube, Optional[Cube return forecast_cube, bias_cubes -def split_pickle_parquet_and_netcdf(files): +def split_pickle_parquet_and_netcdf( + files: List[Path], +) -> Tuple[Optional[CubeList], Optional[List[Path]], Optional[object]]: """Split the input files into pickle, parquet, and netcdf files. - Only a single pickle file is expected. + Any or all of NetCDF, Parquet, and pickle files can be loaded. Only a single + pickle file is expected, but multiple netCDF and parquet files can be provided. Args: files: - A list of input file paths which will be split into pickle, - parquet, and netcdf files. + A list of input file paths. Returns: - - A flattened cube list containing all the cubes contained within the - provided paths to NetCDF files. - - A list of paths to Parquet files. - - A loaded pickle file. + - A flattened cube list containing all the cubes loaded from NetCDF files, or None. + - A list of paths to Parquet files, or None. + - A loaded pickle file, or None. Raises: + ValueError: If the path provided is not loadable as a pickle file, parquet file + or netcdf file. ValueError: If multiple pickle files provided, as only one is ever expected. """ cubes = iris.cube.CubeList() - loaded_pickles = [] + loaded_pickle = None parquets = [] for file_path in files: @@ -301,24 +304,25 @@ def split_pickle_parquet_and_netcdf(files): cube = iris.load(file_path) cubes.extend(cube) except ValueError: + if loaded_pickle is not None: + msg = "Multiple pickle inputs have been provided. Only one is expected." 
+ raise ValueError(msg) try: - loaded_pickles.append(joblib.load(file_path)) + loaded_pickle = joblib.load(file_path) except Exception as e: msg = f"Failed to load {file_path}: {e}" raise ValueError(msg) - if len(loaded_pickles) > 1: - msg = "Multiple pickle inputs have been provided. Only one is expected." - raise ValueError(msg) - return ( cubes if cubes else None, parquets if parquets else None, - loaded_pickles[0] if loaded_pickles else None, + loaded_pickle if loaded_pickle else None, ) -def identify_parquet_type(parquet_paths: List[Path]): +def identify_parquet_type( + parquet_paths: List[Path], +) -> Tuple[Optional[Path], Optional[Path]]: """Determine whether the provided parquet paths contain forecast or truth data. This is done by checking the columns within the parquet files for the presence of a forecast_period column which is only present for forecast data. diff --git a/improver/calibration/load_and_apply_quantile_regression_random_forest.py b/improver/calibration/load_and_apply_quantile_regression_random_forest.py index ee3f0d3f29..5ef43a31ec 100644 --- a/improver/calibration/load_and_apply_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_apply_quantile_regression_random_forest.py @@ -6,6 +6,7 @@ """Script to load and apply the trained Quantile Regression Random Forest (QRF) model.""" +import warnings from typing import Optional import iris @@ -70,7 +71,7 @@ def __init__( if present. unique_site_id_keys (list): The names of the coordinates that uniquely identify each site, - e.g. "wmo_id" or "latitude,longitude". + e.g. "wmo_id" or ["latitude", "longitude"]. """ self.feature_config = feature_config self.target_cf_name = target_cf_name @@ -83,18 +84,22 @@ def _get_inputs( qrf_model: Optional[RandomForestQuantileRegressor] = None, ) -> tuple[CubeList, Cube]: """Split the forecast to be calibrated from the other features. Handle - the case where the qrf_model is not provided. 
In this case, the uncalibrated - forecast is returned with a warning comment added. + the case where the qrf_model is not provided, for example, if the input + data required to train the QRF model isn't yet available. In this case, + the uncalibrated forecast is returned with a warning comment added. Args: cube_inputs: List of cubes containing the features and the forecast to be calibrated. + qrf_model: The trained QRF model to be applied to the forecast. If None, + the input forecast will be returned unchanged with a warning comment + added. Returns: CubeList of the features cubes and the forecast cube. Raises: - ValueError: If not target forecast is provided. + ValueError: If the target forecast is not provided. ValueError: If the number of cubes provided does not match the number of features expected. """ @@ -136,7 +141,7 @@ def _get_inputs( @staticmethod def _compute_quantile_list(forecast_cube: Cube, coord: str) -> list[float]: """Compute the list of quantiles e.g. 0.25, 0.5, 0.75 that will be produced - by using the forecast cube. + from a specified coordinate on the forecast cube. Args: forecast_cube: Forecast to be calibrated. @@ -193,10 +198,10 @@ def process( tuple[RandomForestQuantileRegressor, str, float] ] = None, ) -> Cube: - """Load and applying the trained Quantile Regression Random Forest (QRF) model. - The model is applied to the forecast supplied to calibrate the forecast. - The calibrated forecast is written to a cube. If no model is provided the - input forecast is returned unchanged. + """Load and apply the trained Quantile Regression Random Forest (QRF) model. + The model is used to calibrate the forecast provided. The calibrated forecast + is written to a cube. If no model is provided the input forecast is returned + unchanged. 
Args: cube_inputs: List of cubes containing the features and the forecast to be @@ -222,6 +227,13 @@ def process( assert_spatial_coords_match(cube_inputs) if not self.quantile_forest_installed or not qrf_model: + msg = "Unable to apply Quantile Regression Random Forest model." + if not self.quantile_forest_installed: + msg += " The 'quantile_forest' package is not installed." + elif not qrf_model: + msg += " No trained model has been provided." + msg += " Returning the input forecast without calibration." + warnings.warn(msg) return forecast_cube template_forecast_cube = forecast_cube.copy() diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index 81626d9154..8c6ff23683 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -6,6 +6,7 @@ (QRF).""" import pathlib +import warnings from pathlib import Path from typing import Optional, Union @@ -58,9 +59,10 @@ def __init__( feature_config: Feature configuration defining the features to be used for Quantile Regression Random Forests. parquet_diagnostic_names: - A string containing the diagnostic name that will be used for filtering - the target diagnostic from the forecast and truth DataFrames read in - from the parquet files. This could be different from the CF name e.g. + A list containing the diagnostic names that will be used for filtering + the forecast and truth DataFrames read in from the parquet files. The + target diagnostic name is expected to be the first item in the list. + These names could be different from the CF name e.g. 'temperature_at_screen_level'. target_cf_name: A string containing the CF name of the forecast to be calibrated e.g. air_temperature. @@ -71,6 +73,8 @@ def __init__( YYYYMMDDTHHMMZ. training_length: The number of days of training data to use. 
experiment: The name of the experiment (step) that calibration is applied to. + This is used to filter the forecast DataFrame + on load. unique_site_id_key: The names of the coordinates that uniquely identify each site, e.g. "wmo_id" or "latitude,longitude". """ @@ -104,9 +108,10 @@ def _parse_forecast_periods(self) -> list[int]: forecast_periods = [int(self.forecast_periods) * 3600] except ValueError: msg = ( - "The forecast_periods argument must be a single integer or " - "a range in the form 'start:end:interval'. The forecast period" - f"provided was: {self.forecast_periods}." + "The forecast_periods argument must be a single integer after " + "extraction from the string input, or a range in the form " + "'start:end:interval'. The forecast period provided was: " + f"{self.forecast_periods}." ) raise ValueError(msg) return forecast_periods @@ -177,9 +182,11 @@ def _read_parquet_files( schema=altered_schema, engine="pyarrow", ) - + seconds_to_ns = 1e9 forecast_df = forecast_df[ - forecast_df["forecast_period"].isin(np.array(forecast_periods) * 1e9) + forecast_df["forecast_period"].isin( + np.array(forecast_periods) * seconds_to_ns + ) ].reset_index(drop=True) # Convert df columns from ns to pandas timestamp object. @@ -261,21 +268,42 @@ def process( Parquet file are: ob_value, time, wmo_id, diagnostic, latitude, longitude and altitude. - The path to a Parquet file containing the forecasts to be used - for calibration. + for calibration. The expected columns within the Parquet file are: + forecast, blend_time, forecast_period, forecast_reference_time, time, + wmo_id, percentile, diagnostic, latitude, longitude, period, height, + cf_name, units. Please note that the presence of a forecast_period + column is used to separate the forecast parquet file from the truth + parquet file. - Optionally, paths to NetCDF files containing additional predictors. 
+ + Returns: + Tuple containing: + - DataFrame containing the forecast data. + - DataFrame containing the truth data. + - List of cubes containing additional features. + + A tuple of (None, None, None) is returned if: + - The quantile_forest package is not installed. + - No parquet files are provided. + - Either the forecast or truth parquet files are missing. + """ if not self.quantile_forest_installed: - return None + return None, None, None cube_inputs, parquets, _ = split_pickle_parquet_and_netcdf(file_paths) # If there are no parquet files, return None. if not parquets: + msg = "No parquet files have been provided." + warnings.warn(msg) return None, None, None forecast_table_path, truth_table_path = identify_parquet_type(parquets) # If either the forecast or truth parquet files are missing, return None. if not forecast_table_path or not truth_table_path: + msg = "Both forecast and truth parquet files must be provided." + warnings.warn(msg) return None, None, None forecast_periods = self._parse_forecast_periods() @@ -315,6 +343,7 @@ def __init__( transformation: Optional[str] = None, pre_transform_addition: float = 0, unique_site_id_keys: Union[list[str], str] = "wmo_id", + **kwargs, ): """Initialise the PrepareAndTrainQRF plugin. @@ -331,7 +360,8 @@ def __init__( transformation: Transformation to be applied to the data before fitting. pre_transform_addition: Value to be added before transformation. unique_site_id_key: The names of the coordinates that uniquely identify - each site, e.g. "wmo_id" or "latitude,longitude". + each site, e.g. "wmo_id" or ["latitude", "longitude"]. + kwargs: Additional keyword arguments for the quantile regression model. 
""" self.feature_config = feature_config self.target_cf_name = target_cf_name @@ -344,6 +374,7 @@ def __init__( if isinstance(unique_site_id_keys, str): unique_site_id_keys = [unique_site_id_keys] self.unique_site_id_keys = unique_site_id_keys + self.kwargs = kwargs self.quantile_forest_installed = quantile_forest_package_available() @staticmethod @@ -446,16 +477,26 @@ def process( truth_df: DataFrame containing the truth data. cube_inputs: List of cubes containing additional features. + Returns: A tuple containing: + - The trained RandomForestQuantileRegressor model. + - The transformation applied to the data before fitting. + - The value added before transformation. + Raises: - ValueError: If the number of cubes loaded does not match the number of - features expected. + ValueError: If there are no matching times between the forecast and truth + data. """ if not self.quantile_forest_installed: - return None + return None, None, None intersecting_times = self._check_matching_times(forecast_df, truth_df) if len(intersecting_times) == 0: - return None + msg = ( + "No matching times between the forecast and truth data. " + "Unable to train the Quantile Regression Random Forest model." 
+ ) + warnings.warn(msg) + return None, None, None forecast_df = self._add_static_features_from_cubes_to_df( forecast_df, cube_inputs @@ -472,6 +513,7 @@ def process( transformation=self.transformation, pre_transform_addition=self.pre_transform_addition, unique_site_id_keys=self.unique_site_id_keys, + **self.kwargs, )(forecast_df, truth_df) # Create a tuple that returns the model, transformation and diff --git a/improver/calibration/quantile_regression_random_forest.py b/improver/calibration/quantile_regression_random_forest.py index acb31ccff8..67cc09ebac 100644 --- a/improver/calibration/quantile_regression_random_forest.py +++ b/improver/calibration/quantile_regression_random_forest.py @@ -38,11 +38,14 @@ def prep_feature( pre_transform_addition: np.float32 = 0, unique_site_id_keys: Union[list[str], str] = "wmo_id", ) -> pd.DataFrame: - """Prepare features that require computation from the input DataFrame. Options - available are mean, standard deviation, min, max, percentiles and a members above - and a members below count of the input feature, the day of year, + """Prepare features that require computation from the input DataFrame. + + Options available are mean, standard deviation, min, max, percentiles and a + members above and a members below count of the input feature, the day of year, sine of day of year, cosine of day of year, hour of day, sine of hour of day - and cosine of hour of day. When computing the mean or standard deviation, + and cosine of hour of day. + + When computing the mean or standard deviation, these will be computed over either the percentile or realization column, depending upon which is available. When a percentile column is provided, the expectation is that these percentiles are equally spaced between 0 and 100, so that @@ -63,7 +66,7 @@ def prep_feature( pre_transform_addition: Value to be added before transformation. This is only used when computing members_below or members_above features. 
unique_site_id_keys: The names of the coordinates that uniquely identify - each site, e.g. "wmo_id" or "latitude,longitude". + each site, e.g. "wmo_id" or ["latitude", "longitude"]. Returns: df: DataFrame with the computed feature added. """ @@ -240,7 +243,7 @@ def prep_features_from_config( df: Input DataFrame. feature_config: Feature configuration defining the features to be used for QRF. unique_site_id_keys: The names of the coordinates that uniquely identify - each site, e.g. "wmo_id" or "latitude,longitude". + each site, e.g. "wmo_id" or ["latitude", "longitude"]. Returns: Processed DataFrame and a list of expected column names that will be used as features with the QRF. @@ -374,7 +377,7 @@ def __init__( pre_transform_addition (float): Value to be added before transformation. unique_site_id_keys: The names of the coordinates that uniquely identify - each site, e.g. "wmo_id" or "latitude,longitude". + each site, e.g. "wmo_id" or ["latitude", "longitude"]. kwargs: Additional keyword arguments for the quantile regression model. @@ -521,7 +524,7 @@ def __init__( pre_transform_addition (float): Value to be added before transformation. unique_site_id_keys: The names of the coordinates that uniquely identify - each site, e.g. "wmo_id" or "latitude,longitude". + each site, e.g. "wmo_id" or ["latitude", "longitude"]. """ self.target_name = target_name diff --git a/improver/cli/__init__.py b/improver/cli/__init__.py index 7ee8ba5b67..54ab67a4ff 100644 --- a/improver/cli/__init__.py +++ b/improver/cli/__init__.py @@ -312,7 +312,6 @@ def with_output( pass_through_output=False, compression_level=1, least_significant_digit: int = None, - output_file_type="netCDF", **kwargs, ): """Add `output` keyword only argument. 
diff --git a/improver/cli/train_quantile_regression_random_forest.py b/improver/cli/train_quantile_regression_random_forest.py index e018d5f630..f09a5f4153 100644 --- a/improver/cli/train_quantile_regression_random_forest.py +++ b/improver/cli/train_quantile_regression_random_forest.py @@ -22,6 +22,7 @@ def process( n_estimators: int = 100, max_depth: int = None, max_samples: float = None, + max_features: float = None, random_state: int = None, transformation: str = None, pre_transform_addition: float = 0, @@ -36,13 +37,18 @@ def process( Args: file_paths (cli.inputpaths): - A list of input paths containing: + A list of input paths (in any order) containing: - The path to a Parquet file containing the truths to be used for calibration. The expected columns within the Parquet file are: ob_value, time, wmo_id, diagnostic, latitude, longitude and altitude. - The path to a Parquet file containing the forecasts to be used - for calibration. + for calibration. The expected columns within the Parquet file are: + forecast, blend_time, forecast_period, forecast_reference_time, time, + wmo_id, percentile, diagnostic, latitude, longitude, period, height, + cf_name, units. Please note that the presence of a forecast_period + column is used to separate the forecast parquet file from the truth + parquet file. - Optionally, paths to NetCDF files containing additional preictors. feature_config (dict): Feature configuration defining the features to be used for quantile @@ -60,10 +66,11 @@ def process( "visibility_at_screen_level": ["mean", "std"] "distance_to_water": ["static"], } - parquet_diagnostic_names (str): - A string containing the diagnostic names that will be used for filtering - the target diagnostic from the forecast and truth DataFrames read in - from the parquet files. This could be different from the CF name e.g. 
+ parquet_diagnostic_names (list of str): + A list containing the diagnostic names that will be used for filtering + the forecast and truth DataFrames read in from the parquet files. The + target diagnostic name is expected to be the first item in the list. + These names could be different from the CF name e.g. 'temperature_at_screen_level'. target_cf_name (str): A string containing the CF name of the forecast to be calibrated @@ -78,12 +85,13 @@ def process( training_length (int): The length of the training period in days. experiment (str): - The name of the experiment (step) that calibration is applied to. + The name of the experiment (step) that calibration is applied to. This + is used to filter the forecast DataFrame on load. n_estimators (int): Number of trees in the forest. max_depth (int): Maximum depth of the tree. - max_samples (float): + max_samples (int | float): If an int, then it is the number of samples to draw from the total number of samples available to train each tree. Note that a 'sample' refers to each row within the DataFrames constructed where each row will differ @@ -93,6 +101,10 @@ def process( If None, then each tree contains the same number of samples as the total available. The trees will therefore only differ due to the use of bootstrapping (i.e. sampling with replacement) when creating each tree. + max_features (int | float): + If a float, then it is the fraction of features to consider when looking + for the best split. If int, then it is the number of features that will + be considered at each split. If None, then all features are considered. random_state (int): Random seed for reproducibility. transformation (str): @@ -102,6 +114,7 @@ def process( unique_site_id_keys (str): The names of the coordinates that uniquely identify each site, e.g. "wmo_id" or "latitude,longitude". + kwargs: Additional keyword arguments for the quantile regression model. 
Returns: A quantile regression random forest model with associated transformation and pre-transformation addition that will be stored as a pickle file. @@ -124,6 +137,10 @@ def process( )(file_paths) if forecast_df is None or truth_df is None or cube_inputs is None: return None + + kwargs = {} + if max_features is not None: + kwargs["max_features"] = max_features result = PrepareAndTrainQRF( feature_config=feature_config, target_cf_name=target_cf_name, @@ -134,6 +151,7 @@ def process( transformation=transformation, pre_transform_addition=pre_transform_addition, unique_site_id_keys=unique_site_id_keys, + **kwargs, )(forecast_df, truth_df, cube_inputs) return result diff --git a/improver_tests/acceptance/acceptance.py b/improver_tests/acceptance/acceptance.py index 3616f225be..4ae0efc049 100644 --- a/improver_tests/acceptance/acceptance.py +++ b/improver_tests/acceptance/acceptance.py @@ -339,6 +339,10 @@ def compare( message = "" def message_recorder(exception_message): + """A callback function to record comparison failure messages. + Args: + exception_message (str): The message from the exception raised + during comparison.""" nonlocal difference_found nonlocal message difference_found = True diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py index f42ebcf4b4..f64f7aacd8 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_apply_quantile_regression_random_forest.py @@ -43,6 +43,7 @@ def _add_day_of_training_period_to_cube(cube, day_of_training_period, secondary_ return cube +# Disable ruff formatting to keep the parameter combinations aligned for readability. 
# fmt: off @pytest.mark.parametrize("percentile_input", [True, False]) @pytest.mark.parametrize( @@ -262,7 +263,8 @@ def test_unexpected( ) if exception == "no_model_output": - result = plugin(cube_inputs, qrf_descriptors=None) + with pytest.warns(UserWarning, match="Unable to apply Quantile Regression Random Forest model."): + result = plugin(cube_inputs, qrf_descriptors=None) assert isinstance(result, Cube) assert result.name() == "wind_speed_at_10m" assert result.units == "m s-1" @@ -300,7 +302,8 @@ def test_unexpected( elif exception == "no_quantile_forest_package": qrf_descriptors = (qrf_model, transformation, pre_transform_addition) plugin.quantile_forest_installed = False - result = plugin(CubeList([forecast_cube]), qrf_descriptors=qrf_descriptors) + with pytest.warns(UserWarning, match="Unable to apply Quantile Regression Random Forest model."): + result = plugin(CubeList([forecast_cube]), qrf_descriptors=qrf_descriptors) assert isinstance(result, Cube) assert result.name() == "wind_speed_at_10m" assert result.units == "m s-1" diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py index b2b814b33f..5b34a52ab8 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_load_and_train_quantile_regression_random_forest.py @@ -24,13 +24,14 @@ SITE_ID = ["03001", "03002", "03003", "03004", "03005"] -def _create_multi_site_forecast_parquet_file(tmp_path, representation="percentile"): +def _create_multi_site_forecast_parquet_file(tmp_path, representation=None): """Create a parquet file with multi-site forecast data. Args: tmp_path: Temporary path to save the parquet file. 
representation: The type of ensemble representation to use. Options are - "percentile" or "realization". + "percentile", "realization" or "kittens". "kittens" is just + used for testing that the code works with a non-standard name. """ data_dict = { @@ -52,6 +53,7 @@ def _create_multi_site_forecast_parquet_file(tmp_path, representation="percentil "height": [1.5] * 5, "diagnostic": ["temperature_at_screen_level"] * 5, } + # Other representations used for testing. if representation == "realization": data_dict["realization"] = list(range(len(data_dict["percentile"]))) data_dict.pop("percentile") @@ -84,7 +86,14 @@ def _create_multi_site_forecast_parquet_file(tmp_path, representation="percentil def _create_multi_percentile_forecast_parquet_file(tmp_path, representation=None): - """Create a parquet file with multi-percentile forecast data.""" + """Create a parquet file with multi-percentile forecast data. + + Args: + tmp_path: Temporary path to save the parquet file. + representation: The type of ensemble representation to use. Options are + "percentile" (default), "realization" or "kittens". "kittens" is just + used for testing that the code works with a non-standard name. + """ data_dict = { "percentile": [16 + 2 / 3, 33 + 1 / 3, 50, 66 + 2 / 3, 83 + 1 / 3], @@ -105,6 +114,7 @@ def _create_multi_percentile_forecast_parquet_file(tmp_path, representation=None "height": [1.5] * 5, "diagnostic": ["temperature_at_screen_level"] * 5, } + # Other representations used for testing. if representation == "realization": data_dict["realization"] = list(range(len(data_dict["percentile"]))) data_dict.pop("percentile") @@ -135,7 +145,14 @@ def _create_multi_percentile_forecast_parquet_file(tmp_path, representation=None def _create_multi_forecast_period_forecast_parquet_file(tmp_path, representation=None): - """Create a parquet file with multi-forecast period forecast data.""" + """Create a parquet file with multi-forecast period forecast data. 
+ + Args: + tmp_path: Temporary path to save the parquet file. + representation: The type of ensemble representation to use. Options are + "percentile", "realization" or "kittens". "kittens" is just + used for testing that the code works with a non-standard name. + """ data_dict = { "percentile": [50.0, 50.0, 50.0, 50.0], @@ -165,6 +182,7 @@ def _create_multi_forecast_period_forecast_parquet_file(tmp_path, representation "height": [1.5] * 4, "diagnostic": ["temperature_at_screen_level"] * 4, } + # Other representations used for testing. if representation == "realization": data_dict["realization"] = [0, 1, 0, 1] data_dict.pop("percentile") @@ -342,6 +360,7 @@ def filter_forecast_periods(forecast_df, forecast_periods): def amend_expected_forecast_df( forecast_df, forecast_periods, parquet_diagnostic_names, representation, site_id ): + """Amend the expected forecast DataFrame to match the output of the plugin.""" forecast_df = filter_forecast_periods(forecast_df, forecast_periods) for column in ["time", "forecast_reference_time", "blend_time"]: forecast_df[column] = pd.to_datetime(forecast_df[column], unit="ns", utc=True) @@ -379,6 +398,7 @@ def amend_expected_forecast_df( def amend_expected_truth_df(truth_df, parquet_diagnostic_name): + """Amend the expected truth DataFrame to match the output of the plugin.""" truth_df = truth_df[truth_df["diagnostic"] == parquet_diagnostic_name] truth_df["time"] = pd.to_datetime(truth_df["time"], unit="ns", utc=True) return truth_df @@ -527,7 +547,13 @@ def test_load_for_qrf_no_paths(tmp_path, make_files): cycletime="20170102T0000Z", training_length=2, ) - result = plugin(file_paths) + if make_files: + msg = "Both forecast and truth parquet files must be provided." + else: + msg = "No parquet files have been provided." 
+ + with pytest.warns(UserWarning, match=msg): + result = plugin(file_paths) # Expecting None since no valid paths are provided assert result == (None, None, None) @@ -639,7 +665,7 @@ def test_load_for_qrf_mismatches( ), ], ) -def test_unexpected( +def test_unexpected_loading( tmp_path, exception, forecast_creation, @@ -693,7 +719,7 @@ def test_unexpected( elif exception == "no_quantile_forest_package": plugin.quantile_forest_installed = False result = plugin(file_paths) - assert result is None + assert result == (None, None, None) else: raise ValueError(f"Unknown exception type: {exception}") @@ -705,6 +731,7 @@ def test_unexpected( @pytest.mark.parametrize("remove_target", [True, False]) @pytest.mark.parametrize("include_nans", [True, False]) @pytest.mark.parametrize("include_latlon_nans", [True, False]) +@pytest.mark.parametrize("add_kwargs", [True, False]) @pytest.mark.parametrize( "site_id", ["wmo_id", "station_id", ["wmo_id"], ["latitude", "longitude"]] ) @@ -742,6 +769,7 @@ def test_prepare_and_train_qrf( remove_target, include_nans, include_latlon_nans, + add_kwargs, site_id, forecast_creation, truth_creation, @@ -796,6 +824,9 @@ def test_prepare_and_train_qrf( # As latitude is not a feature, this NaN should be ignored. 
truth_df.loc[1, "latitude"] = pd.NA + if add_kwargs: + kwargs = {"min_samples_leaf": 2} + if feature_config == {}: pytest.skip("No features to train on") @@ -809,6 +840,7 @@ def test_prepare_and_train_qrf( transformation="log", pre_transform_addition=1, unique_site_id_keys=site_id, + **(kwargs if add_kwargs else {}), ) if truth_df["ob_value"].isna().all(): with pytest.raises(ValueError, match="Empty truth DataFrame"): @@ -848,3 +880,32 @@ def test_prepare_and_train_qrf( ) expected = 5.6 np.testing.assert_almost_equal(result, expected, decimal=1) + + +def test_unexpected_preparation( + tmp_path, +): + """Test the PrepareAndTrainQRF plugin for atypical situations.""" + feature_config = {"air_temperature": ["mean", "std", "altitude"]} + n_estimators = 2 + max_depth = 5 + random_state = 46 + target_cf_name = "air_temperature" + + _, forecast_df, _ = _create_multi_site_forecast_parquet_file(tmp_path) + _, truth_df = _create_multi_site_truth_parquet_file(tmp_path) + + truth_df.loc[:, "time"] = pd.Timestamp("2020-01-01 00:00:00", tz="utc") + + plugin = PrepareAndTrainQRF( + feature_config=feature_config, + target_cf_name=target_cf_name, + n_estimators=n_estimators, + max_depth=max_depth, + random_state=random_state, + ) + with pytest.warns( + UserWarning, match="No matching times between the forecast and truth data." 
+ ): + result = plugin(forecast_df, truth_df) + assert result == (None, None, None) diff --git a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py index f85265993b..1dabc77b24 100644 --- a/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py +++ b/improver_tests/calibration/quantile_regression_random_forests_calibration/test_quantile_regression_random_forest.py @@ -53,6 +53,9 @@ def _create_forecasts( validity_time: Timestamp e.g. "20170101T0600Z". data: Data that will be repeated to create a cube with two sites. The length of the data will equal the number of realizations created. + representation: Either "realization" or "percentile" to define the + representation used in the DataFrame. + return_cube: If True, return a cube. If False, return a DataFrame. Returns: Forecast cube containing three percentiles and two sites. @@ -99,7 +102,7 @@ def _create_forecasts( return df -def _add_day_of_training_period(df): +def _add_day_of_training_period(df: pd.DataFrame) -> pd.DataFrame: """Add day of training period coordinate to the dataframe. Args: @@ -115,9 +118,11 @@ def _add_day_of_training_period(df): return df -def _create_ancil_file(return_cube=False): +def _create_ancil_file(return_cube: bool = False) -> Cube | pd.DataFrame: """Create an ancillary file for testing. + Args: + return_cube: If True, return a cube. If False, return a DataFrame. Returns: An ancillary DataFrame without temporal columns. """