feat: validate minimum records in dataset file

Sangamesh26 · Sangamesh26 · commit fad488c36c93 · 2025-06-28T09:22:22.000+05:30
diff --git a/src/llama_prompt_ops/interfaces/cli.py b/src/llama_prompt_ops/interfaces/cli.py
@@ -430,6 +430,14 @@ def get_dataset_adapter_from_config(config_dict, config_path):
     return get_dataset_adapter(config_dict)
 
 
+def validate_min_records_in_dataset(dataset_adapter: DatasetAdapter):
+    """Raise ValueError when the adapter's raw data holds fewer than 4 records."""
+    # The data is split into 25% training, 25% validation, and 50% testing, so
+    # fewer than 4 records leads to runtime errors during optimization.
+    if len(dataset_adapter.load_raw_data()) < 4:
+        raise ValueError("Dataset must contain at least 4 records")
+
+
 def get_models_from_config(config_dict, override_model_name=None, api_key=None):
     """
     Create model adapter instances from configuration.
@@ -791,6 +799,13 @@ def migrate(config, model, output_dir, save_yaml, api_key_env, dotenv_path, log_
     except ValueError as e:
         click.echo(f"Error: {str(e)}", err=True)
         sys.exit(1)
+    
+    # Validate the minimum number of records in the dataset
+    try:
+        validate_min_records_in_dataset(dataset_adapter)
+    except ValueError as e:
+        click.echo(f"Error: {str(e)}", err=True)
+        sys.exit(1)
 
     # Create strategy based on config
     strategy = get_strategy(
diff --git a/tests/integration/test_cli_integration.py b/tests/integration/test_cli_integration.py
@@ -131,6 +131,7 @@ def test_cli_migrate_command(self, mock_api_key_check, temp_config_file):
                 "llama_prompt_ops.interfaces.cli.get_strategy", return_value=MagicMock()
             ),
             patch("llama_prompt_ops.interfaces.cli.load_config", return_value={}),
+            patch("llama_prompt_ops.interfaces.cli.validate_min_records_in_dataset", return_value=None),
         ):
 
             # Run the migrate command
@@ -194,6 +195,7 @@ def test_cli_config_loading(self, mock_api_key_check, facility_config_path):
                 "llama_prompt_ops.interfaces.cli.get_strategy", return_value=MagicMock()
             ),
             patch("llama_prompt_ops.interfaces.cli.load_config", return_value={}),
+            patch("llama_prompt_ops.interfaces.cli.validate_min_records_in_dataset", return_value=None),
         ):
 
             # Run the migrate command with the real config
@@ -268,6 +270,7 @@ def test_end_to_end_cli_flow(self, mock_api_key_check, temp_config_file):
                     return_value=MagicMock(),
                 ),
                 patch("llama_prompt_ops.interfaces.cli.load_config", return_value={}),
+                patch("llama_prompt_ops.interfaces.cli.validate_min_records_in_dataset", return_value=None),
             ):
 
                 # Run the migrate command with the actual file output
diff --git a/tests/unit/test_datasets.py b/tests/unit/test_datasets.py
@@ -163,3 +163,22 @@ def test_custom_split_ratios(mock_dataset_adapter):
     assert len(train) == 70
     assert len(val) == 20
     assert len(test) == 10
+
+
+def test_minimum_records_in_dataset(simple_data_file):
+    """A dataset file with too few records must be rejected with ValueError."""
+    try:
+        from llama_prompt_ops.interfaces.cli import validate_min_records_in_dataset
+    except ImportError as e:
+        pytest.skip(f"Skipping test because module import failed: {str(e)}")
+
+    # The sample data file is expected to hold just 2 records (below the minimum)
+    data_file, _ = simple_data_file
+    adapter = ConfigurableJSONAdapter(
+        dataset_path=data_file.name,
+        input_field="question",
+        golden_output_field="answer",
+    )
+    expected_error = "Dataset must contain at least 4 records"
+    with pytest.raises(ValueError, match=expected_error):
+        validate_min_records_in_dataset(adapter)