linkml · amc-corey-cox · Jun 5, 2026 · Mar 17, 2026 · May 22, 2026 · May 28, 2026
diff --git a/src/linkml_map/cli/cli.py b/src/linkml_map/cli/cli.py
@@ -395,8 +395,8 @@ def _map_data_streaming(
     if emit_spec:
         _emit_spec_to_file(tr, emit_spec)
 
-    # Initialize data loader
-    data_loader = DataLoader(input_path)
+    # Initialize data loader (schema enables type-preserving coercion for TSV/CSV)
+    data_loader = DataLoader(input_path, schemaview=tr.source_schemaview)
 
     # Set up error collection when continue-on-error is enabled
     errors: list[TransformationError] = []

diff --git a/src/linkml_map/loaders/data_loaders.py b/src/linkml_map/loaders/data_loaders.py
@@ -8,6 +8,7 @@
 from typing import Any
 
 import yaml
+from linkml_runtime import SchemaView
 
 
 class FileFormat(str, Enum):
@@ -83,16 +84,25 @@ def __init__(
         self,
         source: str | Path,
         skip_empty_rows: bool = True,
+        schema_path: str | Path | None = None,
+        target_class: str | None = None,
     ) -> None:
         """Initialize TSV loader."""
         super().__init__(source)
         self.skip_empty_rows = skip_empty_rows
+        self.schema_path = schema_path
+        self.target_class = target_class
 
     def iter_instances(self) -> Iterator[dict[str, Any]]:
         """Iterate over rows from the TSV file."""
         from linkml.validator.loaders import TsvLoader
 
-        loader = TsvLoader(str(self.source), skip_empty_rows=self.skip_empty_rows)
+        loader = TsvLoader(
+            str(self.source),
+            skip_empty_rows=self.skip_empty_rows,
+            schema_path=self.schema_path,
+            target_class=self.target_class,
+        )
         yield from loader.iter_instances()
 
 
@@ -103,16 +113,25 @@ def __init__(
         self,
         source: str | Path,
         skip_empty_rows: bool = True,
+        schema_path: str | Path | None = None,
+        target_class: str | None = None,
     ) -> None:
         """Initialize CSV loader."""
         super().__init__(source)
         self.skip_empty_rows = skip_empty_rows
+        self.schema_path = schema_path
+        self.target_class = target_class
 
     def iter_instances(self) -> Iterator[dict[str, Any]]:
         """Iterate over rows from the CSV file."""
         from linkml.validator.loaders import CsvLoader
 
-        loader = CsvLoader(str(self.source), skip_empty_rows=self.skip_empty_rows)
+        loader = CsvLoader(
+            str(self.source),
+            skip_empty_rows=self.skip_empty_rows,
+            schema_path=self.schema_path,
+            target_class=self.target_class,
+        )
         yield from loader.iter_instances()
 
 
@@ -175,13 +194,16 @@ def __init__(
         base_path: str | Path,
         default_format: FileFormat | None = None,
         skip_empty_rows: bool = True,
+        schemaview: SchemaView | None = None,
     ) -> None:
         """
         Initialize the data loader.
 
         :param base_path: Base directory containing data files, or a single file path
         :param default_format: Default format to use when extension is ambiguous
         :param skip_empty_rows: Skip empty rows in tabular files (default: True)
+        :param schemaview: Source schema (enables schema-aware type coercion for TSV/CSV).
+            The target class is derived from each file's identifier.
         :raises FileNotFoundError: If the path does not exist
         """
         self.base_path = Path(base_path)
@@ -190,6 +212,31 @@ def __init__(
             raise FileNotFoundError(msg)
         self.default_format = default_format
         self.skip_empty_rows = skip_empty_rows
+        self.schemaview = schemaview
+
+    def _schema_loader_kwargs(self, identifier: str) -> dict[str, Any]:
+        """
+        Build schema-aware kwargs for a TSV/CSV leaf loader.
+
+        linkml's delimited loader currently takes a ``schema_path``, so we bridge
+        the in-scope :class:`SchemaView` to its source file. When that loader gains
+        native ``SchemaView`` support, this is the single spot that changes.
+
+        :param identifier: Names the source class the file's rows conform to.
+        :return: ``schema_path``/``target_class`` kwargs, or empty if no schema is
+            available (in-memory schemas with no source file degrade to no coercion)
+            or ``identifier`` does not name a class in the schema.
+        """
+        if self.schemaview is None:
+            return {}
+        schema_path = self.schemaview.schema.source_file
+        if schema_path is None:
+            return {}
+        # Single-file mode passes the file stem, which need not be a class name
+        # (e.g. ``data.tsv``); only request coercion for classes the schema defines.
+        if self.schemaview.get_class(identifier) is None:
+            return {}
+        return {"schema_path": schema_path, "target_class": identifier}
 
     @property
     def is_single_file(self) -> bool:
@@ -281,6 +323,7 @@ def __getitem__(self, identifier: str) -> Iterator[dict[str, Any]]:
         file_format = FileFormat.from_extension(file_path)
         if file_format in (FileFormat.TSV, FileFormat.CSV):
             loader_kwargs["skip_empty_rows"] = self.skip_empty_rows
+            loader_kwargs.update(self._schema_loader_kwargs(identifier))
 
         loader = get_file_loader(file_path, **loader_kwargs)
         return loader.iter_instances()
@@ -295,6 +338,8 @@ def __iter__(self) -> Iterator[dict[str, Any]]:
         file_format = FileFormat.from_extension(self.base_path)
         if file_format in (FileFormat.TSV, FileFormat.CSV):
             loader_kwargs["skip_empty_rows"] = self.skip_empty_rows
+            # Single-file mode: the file stem names the source class.
+            loader_kwargs.update(self._schema_loader_kwargs(self.base_path.stem))
 
         loader = get_file_loader(self.base_path, **loader_kwargs)
         yield from loader.iter_instances()

diff --git a/tests/test_cli/test_cli_tabular.py b/tests/test_cli/test_cli_tabular.py
@@ -152,6 +152,41 @@ def test_tsv_input_jsonl_output(
             assert "id" in obj
             assert "label" in obj
 
+    def test_tsv_string_id_not_numerically_coerced(
+        self,
+        runner: CliRunner,
+        tmp_path: Path,
+        sample_schema: Path,
+        sample_transform: Path,
+    ) -> None:
+        """A numeric-looking string id keeps its leading zeros end-to-end.
+
+        Without schema-aware loading, pandas-style inference coerces "00123"
+        to the int 123, losing the leading zeros and breaking downstream lookups.
+        """
+        tsv_path = tmp_path / "Person.tsv"  # stem doubles as the source class name
+        tsv_path.write_text(
+            "id\tname\tprimary_email\tage_in_years\tgender\n00123\tAlice\talice@example.com\t30\tcisgender woman\n"
+        )
+        result = runner.invoke(
+            main,
+            [
+                "map-data",
+                "-T",
+                str(sample_transform),
+                "-s",
+                str(sample_schema),
+                "--source-type",
+                "Person",
+                "-f",
+                "jsonl",
+                str(tsv_path),
+            ],
+        )
+        assert result.exit_code == 0
+        obj = json.loads(result.stdout.strip())  # one input row -> one JSONL object
+        assert obj["id"] == "00123"
+
     def test_tsv_input_tsv_output(
         self,
         runner: CliRunner,

diff --git a/tests/test_loaders/test_data_loader.py b/tests/test_loaders/test_data_loader.py
@@ -5,8 +5,37 @@
 
 import pytest
 import yaml
+from linkml_runtime import SchemaView
 
 from linkml_map.loaders import DataLoader, FileFormat, load_data_file
+from linkml_map.loaders.data_loaders import CsvFileLoader, TsvFileLoader, get_file_loader
+
+SCHEMA_WITH_ENUM = {
+    "id": "https://example.org/test",
+    "name": "test",
+    "prefixes": {"linkml": "https://w3id.org/linkml/"},
+    "imports": ["linkml:types"],
+    "default_range": "string",
+    "classes": {
+        "Record": {  # matches the Record.tsv / Record.csv fixture stems
+            "attributes": {
+                "id": {"range": "integer", "identifier": True},  # expected to load as int
+                "zipcode": {"range": "string"},  # numeric-looking text, must stay str
+                "score": {"range": "score_enum"},  # enum-ranged, must stay str
+                "weight": {"range": "float"},  # expected to load as float
+            }
+        }
+    },
+    "enums": {
+        "score_enum": {
+            "permissible_values": {  # keys are numeric-looking strings
+                "1": {"description": "Low"},
+                "2": {"description": "Medium"},
+                "3": {"description": "High"},
+            }
+        }
+    },
+}
 
 
 @pytest.fixture
@@ -313,3 +342,152 @@ def test_skip_empty_rows_false(self, tmp_path: Path) -> None:
         assert rows[1]["id"] == "P:002"
         # Empty string values are not included in the dict by linkml's loader
         assert "name" not in rows[1] or rows[1].get("name") == ""
+
+
+# --- Schema-aware loading tests ---
+# These verify that schema_path/target_class flow through to the underlying
+# linkml loader so that string-ranged and enum-ranged columns are not
+# coerced to int/float.
+
+
+@pytest.fixture()
+def schema_file(tmp_path: Path) -> Path:
+    """Dump SCHEMA_WITH_ENUM as YAML to ``schema.yaml`` under tmp_path and return its path."""
+    path = tmp_path / "schema.yaml"
+    path.write_text(yaml.dump(SCHEMA_WITH_ENUM))
+    return path
+
+
+@pytest.fixture()
+def schema_aware_tsv(tmp_path: Path) -> Path:
+    """One-row ``Record.tsv`` with numeric-looking values in string and enum columns."""
+    path = tmp_path / "Record.tsv"  # stem matches the "Record" class in SCHEMA_WITH_ENUM
+    path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n")
+    return path
+
+
+@pytest.fixture()
+def schema_aware_csv(tmp_path: Path) -> Path:
+    """One-row ``Record.csv`` with numeric-looking values in string and enum columns."""
+    path = tmp_path / "Record.csv"  # stem matches the "Record" class in SCHEMA_WITH_ENUM
+    path.write_text("id,zipcode,score,weight\n1,90210,2,3.5\n")
+    return path
+
+
+def _assert_schema_aware_row(row: dict) -> None:
+    """Shared assertions: integer/float ranges are coerced, string/enum ranges stay str."""
+    assert row["id"] == 1  # range: integer
+    assert isinstance(row["id"], int)
+    assert row["zipcode"] == "90210"  # range: string
+    assert isinstance(row["zipcode"], str)
+    assert row["score"] == "2"  # range: score_enum
+    assert isinstance(row["score"], str)
+    assert row["weight"] == 3.5  # range: float
+    assert isinstance(row["weight"], float)
+
+
+class TestSchemaAwareTsvFileLoader:
+    """TsvFileLoader preserves string/enum columns when given a schema."""
+
+    def test_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None:
+        loader = TsvFileLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record")
+        row = next(loader.iter_instances())  # fixture holds a single data row
+        _assert_schema_aware_row(row)
+
+    def test_without_schema_coerces(self, schema_aware_tsv: Path) -> None:
+        loader = TsvFileLoader(schema_aware_tsv)  # baseline: no schema_path/target_class
+        row = next(loader.iter_instances())
+        assert isinstance(row["zipcode"], int)  # "90210" is inferred as int
+        assert isinstance(row["score"], int)  # "2" is inferred as int
+
+
+class TestSchemaAwareCsvFileLoader:
+    """CsvFileLoader preserves string/enum columns when given a schema."""
+
+    def test_with_schema(self, schema_aware_csv: Path, schema_file: Path) -> None:
+        loader = CsvFileLoader(schema_aware_csv, schema_path=schema_file, target_class="Record")
+        row = next(loader.iter_instances())  # fixture holds a single data row
+        _assert_schema_aware_row(row)
+
+    def test_without_schema_coerces(self, schema_aware_csv: Path) -> None:
+        loader = CsvFileLoader(schema_aware_csv)  # baseline: no schema_path/target_class
+        row = next(loader.iter_instances())
+        assert isinstance(row["zipcode"], int)  # "90210" is inferred as int
+        assert isinstance(row["score"], int)  # "2" is inferred as int
+
+
+class TestSchemaAwareGetFileLoader:
+    """get_file_loader forwards schema params to TSV/CSV loaders."""
+
+    @pytest.mark.parametrize("fixture_name", ["schema_aware_tsv", "schema_aware_csv"])
+    def test_with_schema(self, fixture_name: str, schema_file: Path, request: pytest.FixtureRequest) -> None:
+        data_file = request.getfixturevalue(fixture_name)  # resolve the parametrized fixture by name
+        loader = get_file_loader(data_file, schema_path=schema_file, target_class="Record")
+        row = next(loader.iter_instances())
+        _assert_schema_aware_row(row)
+
+    def test_rejected_for_yaml(self, tmp_path: Path, schema_file: Path) -> None:
+        """schema_path/target_class are not valid kwargs for non-tabular loaders."""
+        yaml_path = tmp_path / "data.yaml"
+        yaml_path.write_text(yaml.dump({"id": 1, "zipcode": "90210"}))
+        with pytest.raises(TypeError, match="unexpected keyword argument"):  # raised while building the loader
+            get_file_loader(yaml_path, schema_path=schema_file, target_class="Record")
+
+
+class TestSchemaAwareDataLoader:
+    """DataLoader forwards schema params through to underlying loaders."""
+
+    def test_single_file_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None:
+        # Single-file mode derives target_class from the file stem ("Record").
+        loader = DataLoader(schema_aware_tsv, schemaview=SchemaView(str(schema_file)))
+        row = next(iter(loader))  # iterating the DataLoader yields the file's rows
+        _assert_schema_aware_row(row)
+
+    def test_directory_with_schema(self, tmp_path: Path, schema_file: Path) -> None:
+        tsv_path = tmp_path / "Record.tsv"
+        tsv_path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n")
+        # Directory mode derives target_class from the identifier ("Record").
+        loader = DataLoader(tmp_path, schemaview=SchemaView(str(schema_file)))
+        row = next(loader["Record"])  # indexing by identifier returns a row iterator
+        _assert_schema_aware_row(row)
+
+    def test_directory_without_schema_coerces(self, tmp_path: Path) -> None:
+        tsv_path = tmp_path / "Record.tsv"
+        tsv_path.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n")
+        loader = DataLoader(tmp_path)  # baseline: no schemaview
+        row = next(loader["Record"])
+        assert isinstance(row["zipcode"], int)  # "90210" is inferred as int
+
+    def test_iter_sources_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None:
+        loader = DataLoader(schema_aware_tsv, schemaview=SchemaView(str(schema_file)))
+        sources = list(loader.iter_sources())
+        assert len(sources) == 1  # single-file loader exposes exactly one source
+        _, rows = sources[0]  # first element of the pair is not needed here
+        row = next(rows)
+        _assert_schema_aware_row(row)
+
+    def test_directory_derives_target_class_per_identifier(self, tmp_path: Path) -> None:
+        """Each file's target_class is derived from its identifier, so the same
+        column name is coerced differently depending on its class's schema range."""
+        schema = {
+            "id": "https://example.org/multi",
+            "name": "multi",
+            "prefixes": {"linkml": "https://w3id.org/linkml/"},
+            "imports": ["linkml:types"],
+            "default_range": "string",
+            "classes": {
+                "Coded": {"attributes": {"code": {"range": "string"}}},
+                "Numbered": {"attributes": {"code": {"range": "integer"}}},
+            },
+        }
+        schema_path = tmp_path / "multi.yaml"
+        schema_path.write_text(yaml.dump(schema))
+        (tmp_path / "Coded.tsv").write_text("code\n007\n")  # identical cell text in both files
+        (tmp_path / "Numbered.tsv").write_text("code\n007\n")
+
+        loader = DataLoader(tmp_path, schemaview=SchemaView(str(schema_path)))
+        coded = next(loader["Coded"])
+        numbered = next(loader["Numbered"])
+
+        assert coded["code"] == "007"  # string range preserved
+        assert numbered["code"] == 7  # integer range coerced