From 8267d3a6ae32490dde416a02219f16b336674207 Mon Sep 17 00:00:00 2001 From: Sigfried Gold Date: Tue, 17 Mar 2026 14:40:19 -0400 Subject: [PATCH 1/4] Forward schema_path/target_class to linkml's delimited file loader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass through the new schema-aware loading params so that string-ranged and enum-ranged columns in TSV/CSV files are not coerced to int/float. Requires linkml >=1.11 (PR linkml/linkml#3289 added schema-awareness to the underlying _DelimitedFileLoader; released in v1.11.0). Also imports nothing new from `typing` — the new annotations use PEP 604 `X | Y | None` syntax to match the rest of the file's style. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/linkml_map/loaders/data_loaders.py | 45 +++++++- tests/test_loaders/test_data_loader.py | 154 +++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 4 deletions(-) diff --git a/src/linkml_map/loaders/data_loaders.py b/src/linkml_map/loaders/data_loaders.py index 42d542e5..63c796dd 100644 --- a/src/linkml_map/loaders/data_loaders.py +++ b/src/linkml_map/loaders/data_loaders.py @@ -83,16 +83,25 @@ def __init__( self, source: str | Path, skip_empty_rows: bool = True, + schema_path: str | Path | None = None, + target_class: str | None = None, ) -> None: """Initialize TSV loader.""" super().__init__(source) self.skip_empty_rows = skip_empty_rows + self.schema_path = schema_path + self.target_class = target_class def iter_instances(self) -> Iterator[dict[str, Any]]: """Iterate over rows from the TSV file.""" from linkml.validator.loaders import TsvLoader - loader = TsvLoader(str(self.source), skip_empty_rows=self.skip_empty_rows) + loader = TsvLoader( + str(self.source), + skip_empty_rows=self.skip_empty_rows, + schema_path=self.schema_path, + target_class=self.target_class, + ) yield from loader.iter_instances() @@ -103,22 +112,34 @@ def __init__( self, source: str | Path, skip_empty_rows: bool = True, + 
schema_path: str | Path | None = None, + target_class: str | None = None, ) -> None: """Initialize CSV loader.""" super().__init__(source) self.skip_empty_rows = skip_empty_rows + self.schema_path = schema_path + self.target_class = target_class def iter_instances(self) -> Iterator[dict[str, Any]]: """Iterate over rows from the CSV file.""" from linkml.validator.loaders import CsvLoader - loader = CsvLoader(str(self.source), skip_empty_rows=self.skip_empty_rows) + loader = CsvLoader( + str(self.source), + skip_empty_rows=self.skip_empty_rows, + schema_path=self.schema_path, + target_class=self.target_class, + ) yield from loader.iter_instances() def get_file_loader( path: str | Path, file_format: FileFormat | None = None, + *, + schema_path: str | Path | None = None, + target_class: str | None = None, **kwargs: Any, ) -> BaseFileLoader: """ @@ -126,6 +147,8 @@ def get_file_loader( :param path: Path to the file :param file_format: Explicit file format (auto-detected from extension if not provided) + :param schema_path: Path to the LinkML schema (enables schema-aware type coercion for TSV/CSV) + :param target_class: Target class name within the schema :param kwargs: Additional arguments passed to the loader :return: Appropriate file loader instance """ @@ -144,6 +167,10 @@ def get_file_loader( msg = f"No loader available for format: {file_format}" raise ValueError(msg) + if file_format in (FileFormat.TSV, FileFormat.CSV): + kwargs["schema_path"] = schema_path + kwargs["target_class"] = target_class + return loader_class(path, **kwargs) @@ -175,6 +202,8 @@ def __init__( base_path: str | Path, default_format: FileFormat | None = None, skip_empty_rows: bool = True, + schema_path: str | Path | None = None, + target_class: str | None = None, ) -> None: """ Initialize the data loader. 
@@ -182,6 +211,8 @@ def __init__( :param base_path: Base directory containing data files, or a single file path :param default_format: Default format to use when extension is ambiguous :param skip_empty_rows: Skip empty rows in tabular files (default: True) + :param schema_path: Path to the LinkML schema (enables schema-aware type coercion for TSV/CSV) + :param target_class: Target class name within the schema :raises FileNotFoundError: If the path does not exist """ self.base_path = Path(base_path) @@ -190,6 +221,8 @@ def __init__( raise FileNotFoundError(msg) self.default_format = default_format self.skip_empty_rows = skip_empty_rows + self.schema_path = schema_path + self.target_class = target_class @property def is_single_file(self) -> bool: @@ -282,7 +315,9 @@ def __getitem__(self, identifier: str) -> Iterator[dict[str, Any]]: if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows - loader = get_file_loader(file_path, **loader_kwargs) + loader = get_file_loader( + file_path, schema_path=self.schema_path, target_class=self.target_class, **loader_kwargs + ) return loader.iter_instances() def __iter__(self) -> Iterator[dict[str, Any]]: @@ -296,7 +331,9 @@ def __iter__(self) -> Iterator[dict[str, Any]]: if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows - loader = get_file_loader(self.base_path, **loader_kwargs) + loader = get_file_loader( + self.base_path, schema_path=self.schema_path, target_class=self.target_class, **loader_kwargs + ) yield from loader.iter_instances() def get_available_identifiers(self) -> list[str]: diff --git a/tests/test_loaders/test_data_loader.py b/tests/test_loaders/test_data_loader.py index 36e79439..0cda0d68 100644 --- a/tests/test_loaders/test_data_loader.py +++ b/tests/test_loaders/test_data_loader.py @@ -7,6 +7,34 @@ import yaml from linkml_map.loaders import DataLoader, FileFormat, load_data_file +from 
linkml_map.loaders.data_loaders import CsvFileLoader, TsvFileLoader, get_file_loader + +SCHEMA_WITH_ENUM = { + "id": "https://example.org/test", + "name": "test", + "prefixes": {"linkml": "https://w3id.org/linkml/"}, + "imports": ["linkml:types"], + "default_range": "string", + "classes": { + "Record": { + "attributes": { + "id": {"range": "integer", "identifier": True}, + "zipcode": {"range": "string"}, + "score": {"range": "score_enum"}, + "weight": {"range": "float"}, + } + } + }, + "enums": { + "score_enum": { + "permissible_values": { + "1": {"description": "Low"}, + "2": {"description": "Medium"}, + "3": {"description": "High"}, + } + } + }, +} @pytest.fixture @@ -313,3 +341,129 @@ def test_skip_empty_rows_false(self, tmp_path: Path) -> None: assert rows[1]["id"] == "P:002" # Empty string values are not included in the dict by linkml's loader assert "name" not in rows[1] or rows[1].get("name") == "" + + +# --- Schema-aware loading tests --- +# These verify that schema_path/target_class flow through to the underlying +# linkml loader so that string-ranged and enum-ranged columns are not +# coerced to int/float. 
+ + +@pytest.fixture() +def schema_file(tmp_path: Path) -> Path: + """Write the test schema to a YAML file and return its path.""" + path = tmp_path / "schema.yaml" + path.write_text(yaml.dump(SCHEMA_WITH_ENUM)) + return path + + +@pytest.fixture() +def schema_aware_tsv(tmp_path: Path) -> Path: + """TSV with numeric-looking values in string and enum columns.""" + path = tmp_path / "Record.tsv" + path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n") + return path + + +@pytest.fixture() +def schema_aware_csv(tmp_path: Path) -> Path: + """CSV with numeric-looking values in string and enum columns.""" + path = tmp_path / "Record.csv" + path.write_text("id,zipcode,score,weight\n1,90210,2,3.5\n") + return path + + +def _assert_schema_aware_row(row: dict) -> None: + """Shared assertions for schema-aware loading: string/enum columns stay strings.""" + assert row["id"] == 1 + assert isinstance(row["id"], int) + assert row["zipcode"] == "90210" + assert isinstance(row["zipcode"], str) + assert row["score"] == "2" + assert isinstance(row["score"], str) + assert row["weight"] == 3.5 + assert isinstance(row["weight"], float) + + +class TestSchemaAwareTsvFileLoader: + """TsvFileLoader preserves string/enum columns when given a schema.""" + + def test_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None: + loader = TsvFileLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") + row = next(loader.iter_instances()) + _assert_schema_aware_row(row) + + def test_without_schema_coerces(self, schema_aware_tsv: Path) -> None: + loader = TsvFileLoader(schema_aware_tsv) + row = next(loader.iter_instances()) + assert isinstance(row["zipcode"], int) + assert isinstance(row["score"], int) + + +class TestSchemaAwareCsvFileLoader: + """CsvFileLoader preserves string/enum columns when given a schema.""" + + def test_with_schema(self, schema_aware_csv: Path, schema_file: Path) -> None: + loader = CsvFileLoader(schema_aware_csv, 
schema_path=schema_file, target_class="Record") + row = next(loader.iter_instances()) + _assert_schema_aware_row(row) + + def test_without_schema_coerces(self, schema_aware_csv: Path) -> None: + loader = CsvFileLoader(schema_aware_csv) + row = next(loader.iter_instances()) + assert isinstance(row["zipcode"], int) + assert isinstance(row["score"], int) + + +class TestSchemaAwareGetFileLoader: + """get_file_loader forwards schema params to TSV/CSV loaders.""" + + @pytest.mark.parametrize("fixture_name", ["schema_aware_tsv", "schema_aware_csv"]) + def test_with_schema(self, fixture_name: str, schema_file: Path, request: pytest.FixtureRequest) -> None: + data_file = request.getfixturevalue(fixture_name) + loader = get_file_loader(data_file, schema_path=schema_file, target_class="Record") + row = next(loader.iter_instances()) + _assert_schema_aware_row(row) + + def test_ignored_for_yaml(self, tmp_path: Path, schema_file: Path) -> None: + """schema_path/target_class are accepted but ignored for non-tabular formats.""" + yaml_path = tmp_path / "data.yaml" + yaml_path.write_text(yaml.dump({"id": 1, "zipcode": "90210"})) + loader = get_file_loader(yaml_path, schema_path=schema_file, target_class="Record") + row = next(loader.iter_instances()) + assert row["id"] == 1 + + +class TestSchemaAwareDataLoader: + """DataLoader forwards schema params through to underlying loaders.""" + + def test_single_file_with_schema( + self, schema_aware_tsv: Path, schema_file: Path + ) -> None: + loader = DataLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") + row = next(iter(loader)) + _assert_schema_aware_row(row) + + def test_directory_with_schema(self, tmp_path: Path, schema_file: Path) -> None: + tsv_path = tmp_path / "Record.tsv" + tsv_path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n") + loader = DataLoader(tmp_path, schema_path=schema_file, target_class="Record") + row = next(loader["Record"]) + _assert_schema_aware_row(row) + + def 
test_directory_without_schema_coerces(self, tmp_path: Path) -> None: + tsv_path = tmp_path / "Record.tsv" + tsv_path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n") + loader = DataLoader(tmp_path) + row = next(loader["Record"]) + assert isinstance(row["zipcode"], int) + + def test_iter_sources_with_schema( + self, schema_aware_tsv: Path, schema_file: Path + ) -> None: + loader = DataLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") + sources = list(loader.iter_sources()) + assert len(sources) == 1 + _, rows = sources[0] + row = next(rows) + _assert_schema_aware_row(row) From ba1d4d422b25cb54989313c934f1b7f88154485a Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 22 May 2026 15:00:22 -0500 Subject: [PATCH 2/4] Simplify schema-param wiring in get_file_loader Drop the keyword-only schema_path/target_class from get_file_loader and let them flow through **kwargs to the loader class. The DataLoader call sites add them to the same conditional block that already gates skip_empty_rows, so YAML/JSON paths never see them. Passing them to a non-tabular loader now raises TypeError instead of being silently swallowed. 
--- src/linkml_map/loaders/data_loaders.py | 23 +++++++---------------- tests/test_loaders/test_data_loader.py | 17 ++++++----------- 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/src/linkml_map/loaders/data_loaders.py b/src/linkml_map/loaders/data_loaders.py index 63c796dd..a5423bb7 100644 --- a/src/linkml_map/loaders/data_loaders.py +++ b/src/linkml_map/loaders/data_loaders.py @@ -137,9 +137,6 @@ def iter_instances(self) -> Iterator[dict[str, Any]]: def get_file_loader( path: str | Path, file_format: FileFormat | None = None, - *, - schema_path: str | Path | None = None, - target_class: str | None = None, **kwargs: Any, ) -> BaseFileLoader: """ @@ -147,9 +144,7 @@ def get_file_loader( :param path: Path to the file :param file_format: Explicit file format (auto-detected from extension if not provided) - :param schema_path: Path to the LinkML schema (enables schema-aware type coercion for TSV/CSV) - :param target_class: Target class name within the schema - :param kwargs: Additional arguments passed to the loader + :param kwargs: Additional arguments passed to the loader class :return: Appropriate file loader instance """ if file_format is None: @@ -167,10 +162,6 @@ def get_file_loader( msg = f"No loader available for format: {file_format}" raise ValueError(msg) - if file_format in (FileFormat.TSV, FileFormat.CSV): - kwargs["schema_path"] = schema_path - kwargs["target_class"] = target_class - return loader_class(path, **kwargs) @@ -314,10 +305,10 @@ def __getitem__(self, identifier: str) -> Iterator[dict[str, Any]]: file_format = FileFormat.from_extension(file_path) if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows + loader_kwargs["schema_path"] = self.schema_path + loader_kwargs["target_class"] = self.target_class - loader = get_file_loader( - file_path, schema_path=self.schema_path, target_class=self.target_class, **loader_kwargs - ) + loader = get_file_loader(file_path, **loader_kwargs) 
return loader.iter_instances() def __iter__(self) -> Iterator[dict[str, Any]]: @@ -330,10 +321,10 @@ def __iter__(self) -> Iterator[dict[str, Any]]: file_format = FileFormat.from_extension(self.base_path) if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows + loader_kwargs["schema_path"] = self.schema_path + loader_kwargs["target_class"] = self.target_class - loader = get_file_loader( - self.base_path, schema_path=self.schema_path, target_class=self.target_class, **loader_kwargs - ) + loader = get_file_loader(self.base_path, **loader_kwargs) yield from loader.iter_instances() def get_available_identifiers(self) -> list[str]: diff --git a/tests/test_loaders/test_data_loader.py b/tests/test_loaders/test_data_loader.py index 0cda0d68..8e05eab2 100644 --- a/tests/test_loaders/test_data_loader.py +++ b/tests/test_loaders/test_data_loader.py @@ -425,21 +425,18 @@ def test_with_schema(self, fixture_name: str, schema_file: Path, request: pytest row = next(loader.iter_instances()) _assert_schema_aware_row(row) - def test_ignored_for_yaml(self, tmp_path: Path, schema_file: Path) -> None: - """schema_path/target_class are accepted but ignored for non-tabular formats.""" + def test_rejected_for_yaml(self, tmp_path: Path, schema_file: Path) -> None: + """schema_path/target_class are not valid kwargs for non-tabular loaders.""" yaml_path = tmp_path / "data.yaml" yaml_path.write_text(yaml.dump({"id": 1, "zipcode": "90210"})) - loader = get_file_loader(yaml_path, schema_path=schema_file, target_class="Record") - row = next(loader.iter_instances()) - assert row["id"] == 1 + with pytest.raises(TypeError): + get_file_loader(yaml_path, schema_path=schema_file, target_class="Record") class TestSchemaAwareDataLoader: """DataLoader forwards schema params through to underlying loaders.""" - def test_single_file_with_schema( - self, schema_aware_tsv: Path, schema_file: Path - ) -> None: + def test_single_file_with_schema(self, 
schema_aware_tsv: Path, schema_file: Path) -> None: loader = DataLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") row = next(iter(loader)) _assert_schema_aware_row(row) @@ -458,9 +455,7 @@ def test_directory_without_schema_coerces(self, tmp_path: Path) -> None: row = next(loader["Record"]) assert isinstance(row["zipcode"], int) - def test_iter_sources_with_schema( - self, schema_aware_tsv: Path, schema_file: Path - ) -> None: + def test_iter_sources_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None: loader = DataLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") sources = list(loader.iter_sources()) assert len(sources) == 1 From 6785bbb4355d081d84595670a9d4ac4c7e59ce57 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:34:17 -0500 Subject: [PATCH 3/4] Reshape schema-aware loading around SchemaView DataLoader takes a schemaview instead of schema_path/target_class. Target class is derived from each file's identifier (directory mode) or stem (single-file mode), so it composes with multi-file loading. A single bridge method maps the SchemaView to linkml's path-based delimited loader; that's the only spot to change when linkml accepts a SchemaView directly. Leaf loaders keep their path-based exposure to track linkml's API. 
--- src/linkml_map/cli/cli.py | 4 +-- src/linkml_map/loaders/data_loaders.py | 39 ++++++++++++++++++-------- tests/test_cli/test_cli_tabular.py | 35 +++++++++++++++++++++++ tests/test_loaders/test_data_loader.py | 35 +++++++++++++++++++++-- 4 files changed, 97 insertions(+), 16 deletions(-) diff --git a/src/linkml_map/cli/cli.py b/src/linkml_map/cli/cli.py index b118c99a..599c3750 100644 --- a/src/linkml_map/cli/cli.py +++ b/src/linkml_map/cli/cli.py @@ -395,8 +395,8 @@ def _map_data_streaming( if emit_spec: _emit_spec_to_file(tr, emit_spec) - # Initialize data loader - data_loader = DataLoader(input_path) + # Initialize data loader (schema enables type-preserving coercion for TSV/CSV) + data_loader = DataLoader(input_path, schemaview=tr.source_schemaview) # Set up error collection when continue-on-error is enabled errors: list[TransformationError] = [] diff --git a/src/linkml_map/loaders/data_loaders.py b/src/linkml_map/loaders/data_loaders.py index a5423bb7..ac24f830 100644 --- a/src/linkml_map/loaders/data_loaders.py +++ b/src/linkml_map/loaders/data_loaders.py @@ -8,6 +8,7 @@ from typing import Any import yaml +from linkml_runtime import SchemaView class FileFormat(str, Enum): @@ -144,7 +145,7 @@ def get_file_loader( :param path: Path to the file :param file_format: Explicit file format (auto-detected from extension if not provided) - :param kwargs: Additional arguments passed to the loader class + :param kwargs: Additional arguments passed to the loader :return: Appropriate file loader instance """ if file_format is None: @@ -193,8 +194,7 @@ def __init__( base_path: str | Path, default_format: FileFormat | None = None, skip_empty_rows: bool = True, - schema_path: str | Path | None = None, - target_class: str | None = None, + schemaview: SchemaView | None = None, ) -> None: """ Initialize the data loader. 
@@ -202,8 +202,8 @@ def __init__( :param base_path: Base directory containing data files, or a single file path :param default_format: Default format to use when extension is ambiguous :param skip_empty_rows: Skip empty rows in tabular files (default: True) - :param schema_path: Path to the LinkML schema (enables schema-aware type coercion for TSV/CSV) - :param target_class: Target class name within the schema + :param schemaview: Source schema (enables schema-aware type coercion for TSV/CSV). + The target class is derived from each file's identifier. :raises FileNotFoundError: If the path does not exist """ self.base_path = Path(base_path) @@ -212,8 +212,26 @@ def __init__( raise FileNotFoundError(msg) self.default_format = default_format self.skip_empty_rows = skip_empty_rows - self.schema_path = schema_path - self.target_class = target_class + self.schemaview = schemaview + + def _schema_loader_kwargs(self, identifier: str) -> dict[str, Any]: + """ + Build schema-aware kwargs for a TSV/CSV leaf loader. + + linkml's delimited loader currently takes a ``schema_path``, so we bridge + the in-scope :class:`SchemaView` to its source file. When that loader gains + native ``SchemaView`` support, this is the single spot that changes. + + :param identifier: Names the source class the file's rows conform to. + :return: ``schema_path``/``target_class`` kwargs, or empty if no schema is + available (in-memory schemas with no source file degrade to no coercion). 
+ """ + if self.schemaview is None: + return {} + schema_path = self.schemaview.schema.source_file + if schema_path is None: + return {} + return {"schema_path": schema_path, "target_class": identifier} @property def is_single_file(self) -> bool: @@ -305,8 +323,7 @@ def __getitem__(self, identifier: str) -> Iterator[dict[str, Any]]: file_format = FileFormat.from_extension(file_path) if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows - loader_kwargs["schema_path"] = self.schema_path - loader_kwargs["target_class"] = self.target_class + loader_kwargs.update(self._schema_loader_kwargs(identifier)) loader = get_file_loader(file_path, **loader_kwargs) return loader.iter_instances() @@ -321,8 +338,8 @@ def __iter__(self) -> Iterator[dict[str, Any]]: file_format = FileFormat.from_extension(self.base_path) if file_format in (FileFormat.TSV, FileFormat.CSV): loader_kwargs["skip_empty_rows"] = self.skip_empty_rows - loader_kwargs["schema_path"] = self.schema_path - loader_kwargs["target_class"] = self.target_class + # Single-file mode: the file stem names the source class. + loader_kwargs.update(self._schema_loader_kwargs(self.base_path.stem)) loader = get_file_loader(self.base_path, **loader_kwargs) yield from loader.iter_instances() diff --git a/tests/test_cli/test_cli_tabular.py b/tests/test_cli/test_cli_tabular.py index 6dcf42fb..4537aa96 100644 --- a/tests/test_cli/test_cli_tabular.py +++ b/tests/test_cli/test_cli_tabular.py @@ -152,6 +152,41 @@ def test_tsv_input_jsonl_output( assert "id" in obj assert "label" in obj + def test_tsv_string_id_not_numerically_coerced( + self, + runner: CliRunner, + tmp_path: Path, + sample_schema: Path, + sample_transform: Path, + ) -> None: + """A numeric-looking string id keeps its leading zeros end-to-end. + + Without schema-aware loading, pandas-style inference coerces "00123" + to the int 123, losing the leading zeros and breaking downstream lookups. 
+ """ + tsv_path = tmp_path / "Person.tsv" + tsv_path.write_text( + "id\tname\tprimary_email\tage_in_years\tgender\n00123\tAlice\talice@example.com\t30\tcisgender woman\n" + ) + result = runner.invoke( + main, + [ + "map-data", + "-T", + str(sample_transform), + "-s", + str(sample_schema), + "--source-type", + "Person", + "-f", + "jsonl", + str(tsv_path), + ], + ) + assert result.exit_code == 0 + obj = json.loads(result.stdout.strip()) + assert obj["id"] == "00123" + def test_tsv_input_tsv_output( self, runner: CliRunner, diff --git a/tests/test_loaders/test_data_loader.py b/tests/test_loaders/test_data_loader.py index 8e05eab2..3ea8cf82 100644 --- a/tests/test_loaders/test_data_loader.py +++ b/tests/test_loaders/test_data_loader.py @@ -5,6 +5,7 @@ import pytest import yaml +from linkml_runtime import SchemaView from linkml_map.loaders import DataLoader, FileFormat, load_data_file from linkml_map.loaders.data_loaders import CsvFileLoader, TsvFileLoader, get_file_loader @@ -437,14 +438,16 @@ class TestSchemaAwareDataLoader: """DataLoader forwards schema params through to underlying loaders.""" def test_single_file_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None: - loader = DataLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") + # Single-file mode derives target_class from the file stem ("Record"). + loader = DataLoader(schema_aware_tsv, schemaview=SchemaView(str(schema_file))) row = next(iter(loader)) _assert_schema_aware_row(row) def test_directory_with_schema(self, tmp_path: Path, schema_file: Path) -> None: tsv_path = tmp_path / "Record.tsv" tsv_path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n") - loader = DataLoader(tmp_path, schema_path=schema_file, target_class="Record") + # Directory mode derives target_class from the identifier ("Record"). 
+ loader = DataLoader(tmp_path, schemaview=SchemaView(str(schema_file))) row = next(loader["Record"]) _assert_schema_aware_row(row) @@ -456,9 +459,35 @@ def test_directory_without_schema_coerces(self, tmp_path: Path) -> None: assert isinstance(row["zipcode"], int) def test_iter_sources_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None: - loader = DataLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record") + loader = DataLoader(schema_aware_tsv, schemaview=SchemaView(str(schema_file))) sources = list(loader.iter_sources()) assert len(sources) == 1 _, rows = sources[0] row = next(rows) _assert_schema_aware_row(row) + + def test_directory_derives_target_class_per_identifier(self, tmp_path: Path) -> None: + """Each file's target_class is derived from its identifier, so the same + column name is coerced differently depending on its class's schema range.""" + schema = { + "id": "https://example.org/multi", + "name": "multi", + "prefixes": {"linkml": "https://w3id.org/linkml/"}, + "imports": ["linkml:types"], + "default_range": "string", + "classes": { + "Coded": {"attributes": {"code": {"range": "string"}}}, + "Numbered": {"attributes": {"code": {"range": "integer"}}}, + }, + } + schema_path = tmp_path / "multi.yaml" + schema_path.write_text(yaml.dump(schema)) + (tmp_path / "Coded.tsv").write_text("code\n007\n") + (tmp_path / "Numbered.tsv").write_text("code\n007\n") + + loader = DataLoader(tmp_path, schemaview=SchemaView(str(schema_path))) + coded = next(loader["Coded"]) + numbered = next(loader["Numbered"]) + + assert coded["code"] == "007" # string range preserved + assert numbered["code"] == 7 # integer range coerced From d6343a4fec4af65e9ebd001c6b69d554adfd6969 Mon Sep 17 00:00:00 2001 From: amc-corey-cox <69321580+amc-corey-cox@users.noreply.github.com> Date: Fri, 5 Jun 2026 10:05:45 -0500 Subject: [PATCH 4/4] Address PR review feedback Pin the get_file_loader TypeError test to the kwarg-rejection message so an unrelated 
TypeError can't pass it. --- tests/test_loaders/test_data_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_loaders/test_data_loader.py b/tests/test_loaders/test_data_loader.py index 3ea8cf82..9c18ab51 100644 --- a/tests/test_loaders/test_data_loader.py +++ b/tests/test_loaders/test_data_loader.py @@ -430,7 +430,7 @@ def test_rejected_for_yaml(self, tmp_path: Path, schema_file: Path) -> None: """schema_path/target_class are not valid kwargs for non-tabular loaders.""" yaml_path = tmp_path / "data.yaml" yaml_path.write_text(yaml.dump({"id": 1, "zipcode": "90210"})) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="unexpected keyword argument"): get_file_loader(yaml_path, schema_path=schema_file, target_class="Record")