def iter_instances(self) -> Iterator[dict[str, Any]]:
    """
    Yield one dict per row of the TSV file at ``self.source``.

    Reading is delegated to linkml's ``TsvLoader``. ``skip_empty_rows``,
    ``schema_path`` and ``target_class`` are forwarded exactly as stored on
    this loader (the latter two are ``None`` when no schema was configured).
    """
    # Imported lazily, as in the rest of this module's leaf loaders.
    from linkml.validator.loaders import TsvLoader

    options: dict[str, Any] = {
        "skip_empty_rows": self.skip_empty_rows,
        "schema_path": self.schema_path,
        "target_class": self.target_class,
    }
    rows = TsvLoader(str(self.source), **options).iter_instances()
    yield from rows
iter_instances(self) -> Iterator[dict[str, Any]]: """Iterate over rows from the CSV file.""" from linkml.validator.loaders import CsvLoader - loader = CsvLoader(str(self.source), skip_empty_rows=self.skip_empty_rows) + loader = CsvLoader( + str(self.source), + skip_empty_rows=self.skip_empty_rows, + schema_path=self.schema_path, + target_class=self.target_class, + ) yield from loader.iter_instances() @@ -175,6 +194,7 @@ def __init__( base_path: str | Path, default_format: FileFormat | None = None, skip_empty_rows: bool = True, + schemaview: SchemaView | None = None, ) -> None: """ Initialize the data loader. @@ -182,6 +202,8 @@ def __init__( :param base_path: Base directory containing data files, or a single file path :param default_format: Default format to use when extension is ambiguous :param skip_empty_rows: Skip empty rows in tabular files (default: True) + :param schemaview: Source schema (enables schema-aware type coercion for TSV/CSV). + The target class is derived from each file's identifier. :raises FileNotFoundError: If the path does not exist """ self.base_path = Path(base_path) @@ -190,6 +212,26 @@ def __init__( raise FileNotFoundError(msg) self.default_format = default_format self.skip_empty_rows = skip_empty_rows + self.schemaview = schemaview + + def _schema_loader_kwargs(self, identifier: str) -> dict[str, Any]: + """ + Build schema-aware kwargs for a TSV/CSV leaf loader. + + linkml's delimited loader currently takes a ``schema_path``, so we bridge + the in-scope :class:`SchemaView` to its source file. When that loader gains + native ``SchemaView`` support, this is the single spot that changes. + + :param identifier: Names the source class the file's rows conform to. + :return: ``schema_path``/``target_class`` kwargs, or empty if no schema is + available (in-memory schemas with no source file degrade to no coercion). 
+ """ + if self.schemaview is None: + return {} + schema_path = self.schemaview.schema.source_file + if schema_path is None: + return {} + return {"schema_path": schema_path, "target_class": identifier} @property def is_single_file(self) -> bool: @@ -281,6 +323,7 @@ def __getitem__(self, identifier: str) -> Iterator[dict[str, Any]]: file_format = FileFormat.from_extension(file_path) if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows + loader_kwargs.update(self._schema_loader_kwargs(identifier)) loader = get_file_loader(file_path, **loader_kwargs) return loader.iter_instances() @@ -295,6 +338,8 @@ def __iter__(self) -> Iterator[dict[str, Any]]: file_format = FileFormat.from_extension(self.base_path) if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows + # Single-file mode: the file stem names the source class. + loader_kwargs.update(self._schema_loader_kwargs(self.base_path.stem)) loader = get_file_loader(self.base_path, **loader_kwargs) yield from loader.iter_instances() diff --git a/tests/test_cli/test_cli_tabular.py b/tests/test_cli/test_cli_tabular.py index 6dcf42fb..4537aa96 100644 --- a/tests/test_cli/test_cli_tabular.py +++ b/tests/test_cli/test_cli_tabular.py @@ -152,6 +152,41 @@ def test_tsv_input_jsonl_output( assert "id" in obj assert "label" in obj + def test_tsv_string_id_not_numerically_coerced( + self, + runner: CliRunner, + tmp_path: Path, + sample_schema: Path, + sample_transform: Path, + ) -> None: + """A numeric-looking string id keeps its leading zeros end-to-end. + + Without schema-aware loading, pandas-style inference coerces "00123" + to the int 123, losing the leading zeros and breaking downstream lookups. 
# Minimal LinkML schema shared by the schema-aware loader tests: a single
# "Record" class with an integer identifier, a string slot, an enum slot whose
# permissible values look numeric, and a float slot.
SCHEMA_WITH_ENUM = {
    "id": "https://example.org/test",
    "name": "test",
    "prefixes": {"linkml": "https://w3id.org/linkml/"},
    "imports": ["linkml:types"],
    "default_range": "string",
    "classes": {
        "Record": {
            "attributes": {
                "id": {"range": "integer", "identifier": True},
                # Remaining slots only declare a range.
                **{
                    slot: {"range": slot_range}
                    for slot, slot_range in (
                        ("zipcode", "string"),
                        ("score", "score_enum"),
                        ("weight", "float"),
                    )
                },
            }
        }
    },
    "enums": {
        "score_enum": {
            "permissible_values": {
                code: {"description": label}
                for code, label in (("1", "Low"), ("2", "Medium"), ("3", "High"))
            }
        }
    },
}
columns are not +# coerced to int/float. + + +@pytest.fixture() +def schema_file(tmp_path: Path) -> Path: + """Write the test schema to a YAML file and return its path.""" + path = tmp_path / "schema.yaml" + path.write_text(yaml.dump(SCHEMA_WITH_ENUM)) + return path + + +@pytest.fixture() +def schema_aware_tsv(tmp_path: Path) -> Path: + """TSV with numeric-looking values in string and enum columns.""" + path = tmp_path / "Record.tsv" + path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n") + return path + + +@pytest.fixture() +def schema_aware_csv(tmp_path: Path) -> Path: + """CSV with numeric-looking values in string and enum columns.""" + path = tmp_path / "Record.csv" + path.write_text("id,zipcode,score,weight\n1,90210,2,3.5\n") + return path + + +def _assert_schema_aware_row(row: dict) -> None: + """Shared assertions for schema-aware loading: string/enum columns stay strings.""" + assert row["id"] == 1 + assert isinstance(row["id"], int) + assert row["zipcode"] == "90210" + assert isinstance(row["zipcode"], str) + assert row["score"] == "2" + assert isinstance(row["score"], str) + assert row["weight"] == 3.5 + assert isinstance(row["weight"], float) + + +class TestSchemaAwareTsvFileLoader: + """TsvFileLoader preserves string/enum columns when given a schema.""" + + def test_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None: + loader = TsvFileLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") + row = next(loader.iter_instances()) + _assert_schema_aware_row(row) + + def test_without_schema_coerces(self, schema_aware_tsv: Path) -> None: + loader = TsvFileLoader(schema_aware_tsv) + row = next(loader.iter_instances()) + assert isinstance(row["zipcode"], int) + assert isinstance(row["score"], int) + + +class TestSchemaAwareCsvFileLoader: + """CsvFileLoader preserves string/enum columns when given a schema.""" + + def test_with_schema(self, schema_aware_csv: Path, schema_file: Path) -> None: + loader = 
class TestSchemaAwareGetFileLoader:
    """get_file_loader hands schema kwargs to TSV/CSV loaders and to no others."""

    @pytest.mark.parametrize("fixture_name", ["schema_aware_tsv", "schema_aware_csv"])
    def test_with_schema(self, fixture_name: str, schema_file: Path, request: pytest.FixtureRequest) -> None:
        """Both tabular formats are loaded with schema-driven typing."""
        tabular_path = request.getfixturevalue(fixture_name)
        instances = get_file_loader(
            tabular_path,
            schema_path=schema_file,
            target_class="Record",
        ).iter_instances()
        _assert_schema_aware_row(next(instances))

    def test_rejected_for_yaml(self, tmp_path: Path, schema_file: Path) -> None:
        """schema_path/target_class are not valid kwargs for non-tabular loaders."""
        payload = {"id": 1, "zipcode": "90210"}
        yaml_path = tmp_path / "data.yaml"
        yaml_path.write_text(yaml.dump(payload))
        schema_kwargs = {"schema_path": schema_file, "target_class": "Record"}
        with pytest.raises(TypeError, match="unexpected keyword argument"):
            get_file_loader(yaml_path, **schema_kwargs)
def test_directory_derives_target_class_per_identifier(self, tmp_path: Path) -> None:
    """The same column is typed per class, because target_class follows the identifier.

    Two classes declare a ``code`` attribute with different ranges; identical
    file contents must therefore load as a string for one and an int for the other.
    """
    classes = {
        "Coded": {"attributes": {"code": {"range": "string"}}},
        "Numbered": {"attributes": {"code": {"range": "integer"}}},
    }
    schema = {
        "id": "https://example.org/multi",
        "name": "multi",
        "prefixes": {"linkml": "https://w3id.org/linkml/"},
        "imports": ["linkml:types"],
        "default_range": "string",
        "classes": classes,
    }
    schema_path = tmp_path / "multi.yaml"
    schema_path.write_text(yaml.dump(schema))

    # Identical content in both files; only the owning class differs.
    tsv_body = "code\n007\n"
    (tmp_path / "Coded.tsv").write_text(tsv_body)
    (tmp_path / "Numbered.tsv").write_text(tsv_body)

    view = SchemaView(str(schema_path))
    loader = DataLoader(tmp_path, schemaview=view)
    coded = next(loader["Coded"])
    numbered = next(loader["Numbered"])

    assert coded["code"] == "007"  # string range preserved
    assert numbered["code"] == 7  # integer range coerced