Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/linkml_map/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,8 +395,8 @@ def _map_data_streaming(
if emit_spec:
_emit_spec_to_file(tr, emit_spec)

# Initialize data loader
data_loader = DataLoader(input_path)
# Initialize data loader (schema enables type-preserving coercion for TSV/CSV)
data_loader = DataLoader(input_path, schemaview=tr.source_schemaview)

# Set up error collection when continue-on-error is enabled
errors: list[TransformationError] = []
Expand Down
49 changes: 47 additions & 2 deletions src/linkml_map/loaders/data_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Any

import yaml
from linkml_runtime import SchemaView


class FileFormat(str, Enum):
Expand Down Expand Up @@ -83,16 +84,25 @@ def __init__(
self,
source: str | Path,
skip_empty_rows: bool = True,
schema_path: str | Path | None = None,
Comment thread
amc-corey-cox marked this conversation as resolved.
target_class: str | None = None,
) -> None:
"""Initialize TSV loader."""
super().__init__(source)
self.skip_empty_rows = skip_empty_rows
self.schema_path = schema_path
self.target_class = target_class

def iter_instances(self) -> Iterator[dict[str, Any]]:
    """
    Iterate over rows from the TSV file.

    Delegates to linkml's ``TsvLoader``, forwarding ``skip_empty_rows``,
    ``schema_path`` and ``target_class`` as stored on this instance.

    :return: Iterator over the row dicts produced by the linkml loader
    """
    # Function-scope import: the linkml loader module is only imported when
    # rows are actually requested.
    from linkml.validator.loaders import TsvLoader

    # Construct the loader exactly once, with the schema arguments included;
    # a separate schema-less construction beforehand would just be discarded.
    loader = TsvLoader(
        str(self.source),
        skip_empty_rows=self.skip_empty_rows,
        schema_path=self.schema_path,
        target_class=self.target_class,
    )
    yield from loader.iter_instances()


Expand All @@ -103,16 +113,25 @@ def __init__(
self,
source: str | Path,
skip_empty_rows: bool = True,
schema_path: str | Path | None = None,
target_class: str | None = None,
) -> None:
"""Initialize CSV loader."""
super().__init__(source)
self.skip_empty_rows = skip_empty_rows
self.schema_path = schema_path
self.target_class = target_class

def iter_instances(self) -> Iterator[dict[str, Any]]:
    """
    Iterate over rows from the CSV file.

    Delegates to linkml's ``CsvLoader``, forwarding ``skip_empty_rows``,
    ``schema_path`` and ``target_class`` as stored on this instance.

    :return: Iterator over the row dicts produced by the linkml loader
    """
    # Function-scope import: the linkml loader module is only imported when
    # rows are actually requested.
    from linkml.validator.loaders import CsvLoader

    # Construct the loader exactly once, with the schema arguments included;
    # a separate schema-less construction beforehand would just be discarded.
    loader = CsvLoader(
        str(self.source),
        skip_empty_rows=self.skip_empty_rows,
        schema_path=self.schema_path,
        target_class=self.target_class,
    )
    yield from loader.iter_instances()


Expand Down Expand Up @@ -175,13 +194,16 @@ def __init__(
base_path: str | Path,
default_format: FileFormat | None = None,
skip_empty_rows: bool = True,
schemaview: SchemaView | None = None,
) -> None:
"""
Initialize the data loader.

:param base_path: Base directory containing data files, or a single file path
:param default_format: Default format to use when extension is ambiguous
:param skip_empty_rows: Skip empty rows in tabular files (default: True)
:param schemaview: Source schema (enables schema-aware type coercion for TSV/CSV).
The target class is derived from each file's identifier.
:raises FileNotFoundError: If the path does not exist
"""
self.base_path = Path(base_path)
Expand All @@ -190,6 +212,26 @@ def __init__(
raise FileNotFoundError(msg)
self.default_format = default_format
self.skip_empty_rows = skip_empty_rows
self.schemaview = schemaview

def _schema_loader_kwargs(self, identifier: str) -> dict[str, Any]:
    """
    Build schema-aware kwargs for a TSV/CSV leaf loader.

    linkml's delimited loader currently takes a ``schema_path``, so we bridge
    the in-scope :class:`SchemaView` to its source file. When that loader gains
    native ``SchemaView`` support, this is the single spot that changes.

    :param identifier: Names the source class the file's rows conform to.
    :return: ``schema_path``/``target_class`` kwargs, or an empty dict when
        coercion cannot be configured: no schema view was supplied, the schema
        has no source file (e.g. it was built in memory), or ``identifier`` is
        not a class defined in the schema. In all of these cases loading
        degrades to no schema-aware coercion.
    """
    view = self.schemaview
    if view is None:
        return {}

    schema_path = view.schema.source_file
    if schema_path is None:
        return {}

    # The identifier can be a bare file stem (single-file mode), which is not
    # guaranteed to match a class name. Only forward it as ``target_class``
    # when the schema actually defines that class.
    if view.get_class(identifier) is None:
        return {}

    return {"schema_path": schema_path, "target_class": identifier}

@property
def is_single_file(self) -> bool:
Expand Down Expand Up @@ -281,6 +323,7 @@ def __getitem__(self, identifier: str) -> Iterator[dict[str, Any]]:
file_format = FileFormat.from_extension(file_path)
if file_format in (FileFormat.TSV, FileFormat.CSV):
loader_kwargs["skip_empty_rows"] = self.skip_empty_rows
loader_kwargs.update(self._schema_loader_kwargs(identifier))

loader = get_file_loader(file_path, **loader_kwargs)
return loader.iter_instances()
Expand All @@ -295,6 +338,8 @@ def __iter__(self) -> Iterator[dict[str, Any]]:
file_format = FileFormat.from_extension(self.base_path)
if file_format in (FileFormat.TSV, FileFormat.CSV):
loader_kwargs["skip_empty_rows"] = self.skip_empty_rows
# Single-file mode: the file stem names the source class.
loader_kwargs.update(self._schema_loader_kwargs(self.base_path.stem))

loader = get_file_loader(self.base_path, **loader_kwargs)
yield from loader.iter_instances()
Expand Down
35 changes: 35 additions & 0 deletions tests/test_cli/test_cli_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,41 @@ def test_tsv_input_jsonl_output(
assert "id" in obj
assert "label" in obj

def test_tsv_string_id_not_numerically_coerced(
    self,
    runner: CliRunner,
    tmp_path: Path,
    sample_schema: Path,
    sample_transform: Path,
) -> None:
    """A string-ranged id that looks numeric survives ``map-data`` unchanged.

    The input row carries the id "00123". Without schema-aware loading,
    pandas-style inference coerces it to the int 123, which drops the leading
    zeros and breaks downstream lookups; the JSONL output must still hold
    the original string.
    """
    person_tsv = tmp_path / "Person.tsv"
    person_tsv.write_text(
        "id\tname\tprimary_email\tage_in_years\tgender\n00123\tAlice\talice@example.com\t30\tcisgender woman\n"
    )

    # File is named after the source class ("Person") it contains rows for.
    cli_args = [
        "map-data",
        "-T",
        str(sample_transform),
        "-s",
        str(sample_schema),
        "--source-type",
        "Person",
        "-f",
        "jsonl",
        str(person_tsv),
    ]
    outcome = runner.invoke(main, cli_args)

    assert outcome.exit_code == 0
    # One input row, so stdout is a single JSON document.
    record = json.loads(outcome.stdout.strip())
    assert record["id"] == "00123"

def test_tsv_input_tsv_output(
self,
runner: CliRunner,
Expand Down
178 changes: 178 additions & 0 deletions tests/test_loaders/test_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,37 @@

import pytest
import yaml
from linkml_runtime import SchemaView

from linkml_map.loaders import DataLoader, FileFormat, load_data_file
from linkml_map.loaders.data_loaders import CsvFileLoader, TsvFileLoader, get_file_loader

# Minimal schema (as a plain dict) for the schema-aware loading tests.
# "Record" mixes integer, string, enum and float ranges so that
# numeric-looking cell values exercise every coercion path.
SCHEMA_WITH_ENUM = {
    "id": "https://example.org/test",
    "name": "test",
    "prefixes": {"linkml": "https://w3id.org/linkml/"},
    "imports": ["linkml:types"],
    "default_range": "string",
    "classes": {
        "Record": {
            "attributes": {
                "id": {"range": "integer", "identifier": True},
                # String-ranged, but the test data holds digits only.
                "zipcode": {"range": "string"},
                # Enum whose permissible values are digit strings.
                "score": {"range": "score_enum"},
                "weight": {"range": "float"},
            },
        },
    },
    "enums": {
        "score_enum": {
            "permissible_values": {
                "1": {"description": "Low"},
                "2": {"description": "Medium"},
                "3": {"description": "High"},
            },
        },
    },
}


@pytest.fixture
Expand Down Expand Up @@ -313,3 +342,152 @@ def test_skip_empty_rows_false(self, tmp_path: Path) -> None:
assert rows[1]["id"] == "P:002"
# Empty string values are not included in the dict by linkml's loader
assert "name" not in rows[1] or rows[1].get("name") == ""


# --- Schema-aware loading tests ---
# These verify that schema_path/target_class flow through to the underlying
# linkml loader so that string-ranged and enum-ranged columns are not
# coerced to int/float.


@pytest.fixture()
def schema_file(tmp_path: Path) -> Path:
    """Dump ``SCHEMA_WITH_ENUM`` as YAML to ``schema.yaml`` under ``tmp_path`` and return that path."""
    schema_yaml = yaml.dump(SCHEMA_WITH_ENUM)
    target = tmp_path / "schema.yaml"
    target.write_text(schema_yaml)
    return target


@pytest.fixture()
def schema_aware_tsv(tmp_path: Path) -> Path:
    """Create ``Record.tsv`` with one row whose string and enum cells look numeric."""
    tsv_file = tmp_path / "Record.tsv"
    # Header plus a single data row; zipcode and score hold digits only.
    tsv_file.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n")
    return tsv_file


@pytest.fixture()
def schema_aware_csv(tmp_path: Path) -> Path:
    """Create ``Record.csv`` with one row whose string and enum cells look numeric."""
    csv_file = tmp_path / "Record.csv"
    # Header plus a single data row; zipcode and score hold digits only.
    csv_file.write_text("id,zipcode,score,weight\n1,90210,2,3.5\n")
    return csv_file


def _assert_schema_aware_row(row: dict) -> None:
    """
    Shared assertions for schema-aware loading: string/enum columns stay strings.

    Checks the four columns of the single test row: ``id`` is the int 1,
    ``zipcode`` the string "90210", ``score`` the string "2" and ``weight``
    the float 3.5.

    :param row: Row dict produced by a loader
    :raises AssertionError: If a value or its type differs; the message names
        the offending column
    :raises KeyError: If an expected column is missing from ``row``
    """
    expected = {
        "id": (1, int),
        "zipcode": ("90210", str),
        "score": ("2", str),
        "weight": (3.5, float),
    }
    for column, (value, kind) in expected.items():
        actual = row[column]
        assert actual == value, f"{column}: expected {value!r}, got {actual!r}"
        # bool is a subclass of int and True == 1, so a boolean-coerced cell
        # would otherwise slip through the integer check.
        assert isinstance(actual, kind) and not isinstance(actual, bool), (
            f"{column}: expected {kind.__name__}, got {type(actual).__name__}"
        )


class TestSchemaAwareTsvFileLoader:
    """TsvFileLoader preserves string/enum columns when given a schema."""

    def test_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None:
        """With ``schema_path`` and ``target_class`` set, the first row passes the shared type checks."""
        tsv_loader = TsvFileLoader(schema_aware_tsv, schema_path=schema_file, target_class="Record")
        rows = tsv_loader.iter_instances()
        _assert_schema_aware_row(next(rows))

    def test_without_schema_coerces(self, schema_aware_tsv: Path) -> None:
        """Without a schema, the numeric-looking ``zipcode`` and ``score`` cells load as ints."""
        first_row = next(TsvFileLoader(schema_aware_tsv).iter_instances())
        for column in ("zipcode", "score"):
            assert isinstance(first_row[column], int)


class TestSchemaAwareCsvFileLoader:
    """CsvFileLoader preserves string/enum columns when given a schema."""

    def test_with_schema(self, schema_aware_csv: Path, schema_file: Path) -> None:
        """With ``schema_path`` and ``target_class`` set, the first row passes the shared type checks."""
        csv_loader = CsvFileLoader(schema_aware_csv, schema_path=schema_file, target_class="Record")
        rows = csv_loader.iter_instances()
        _assert_schema_aware_row(next(rows))

    def test_without_schema_coerces(self, schema_aware_csv: Path) -> None:
        """Without a schema, the numeric-looking ``zipcode`` and ``score`` cells load as ints."""
        first_row = next(CsvFileLoader(schema_aware_csv).iter_instances())
        for column in ("zipcode", "score"):
            assert isinstance(first_row[column], int)


class TestSchemaAwareGetFileLoader:
    """get_file_loader forwards schema params to TSV/CSV loaders."""

    @pytest.mark.parametrize("fixture_name", ["schema_aware_tsv", "schema_aware_csv"])
    def test_with_schema(self, fixture_name: str, schema_file: Path, request: pytest.FixtureRequest) -> None:
        """Both tabular formats yield a correctly typed first row when schema kwargs are passed."""
        # Resolve the parametrized fixture by name to get the data file path.
        tabular_file = request.getfixturevalue(fixture_name)
        file_loader = get_file_loader(tabular_file, schema_path=schema_file, target_class="Record")
        first_row = next(file_loader.iter_instances())
        _assert_schema_aware_row(first_row)

    def test_rejected_for_yaml(self, tmp_path: Path, schema_file: Path) -> None:
        """schema_path/target_class are not valid kwargs for non-tabular loaders."""
        yaml_file = tmp_path / "data.yaml"
        yaml_file.write_text(yaml.dump({"id": 1, "zipcode": "90210"}))
        schema_kwargs = {"schema_path": schema_file, "target_class": "Record"}
        with pytest.raises(TypeError, match="unexpected keyword argument"):
            get_file_loader(yaml_file, **schema_kwargs)


class TestSchemaAwareDataLoader:
    """DataLoader forwards schema params through to underlying loaders."""

    def test_single_file_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None:
        """Iterating a single-file loader yields a correctly typed row."""
        view = SchemaView(str(schema_file))
        # Single-file mode derives target_class from the file stem ("Record").
        data_loader = DataLoader(schema_aware_tsv, schemaview=view)
        first_row = next(iter(data_loader))
        _assert_schema_aware_row(first_row)

    def test_directory_with_schema(self, tmp_path: Path, schema_file: Path) -> None:
        """Looking up an identifier in directory mode yields a correctly typed row."""
        record_tsv = tmp_path / "Record.tsv"
        record_tsv.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n")
        view = SchemaView(str(schema_file))
        # Directory mode derives target_class from the identifier ("Record").
        data_loader = DataLoader(tmp_path, schemaview=view)
        _assert_schema_aware_row(next(data_loader["Record"]))

    def test_directory_without_schema_coerces(self, tmp_path: Path) -> None:
        """Without a schema view, the numeric-looking ``zipcode`` cell loads as an int."""
        record_tsv = tmp_path / "Record.tsv"
        record_tsv.write_text("id\tzipcode\tscore\tweight\n1\t90210\t2\t3.5\n")
        first_row = next(DataLoader(tmp_path)["Record"])
        assert isinstance(first_row["zipcode"], int)

    def test_iter_sources_with_schema(self, schema_aware_tsv: Path, schema_file: Path) -> None:
        """``iter_sources`` on a single file yields one source whose rows are correctly typed."""
        view = SchemaView(str(schema_file))
        data_loader = DataLoader(schema_aware_tsv, schemaview=view)
        all_sources = list(data_loader.iter_sources())
        assert len(all_sources) == 1
        # Each source is a pair; only the row iterator is needed here.
        _, row_iter = all_sources[0]
        _assert_schema_aware_row(next(row_iter))

    def test_directory_derives_target_class_per_identifier(self, tmp_path: Path) -> None:
        """Each file's target_class is derived from its identifier, so the same
        column name is coerced differently depending on its class's schema range."""
        # Two classes share the column name "code" but declare different ranges.
        class_defs = {
            "Coded": {"attributes": {"code": {"range": "string"}}},
            "Numbered": {"attributes": {"code": {"range": "integer"}}},
        }
        multi_schema = {
            "id": "https://example.org/multi",
            "name": "multi",
            "prefixes": {"linkml": "https://w3id.org/linkml/"},
            "imports": ["linkml:types"],
            "default_range": "string",
            "classes": class_defs,
        }
        multi_schema_file = tmp_path / "multi.yaml"
        multi_schema_file.write_text(yaml.dump(multi_schema))
        (tmp_path / "Coded.tsv").write_text("code\n007\n")
        (tmp_path / "Numbered.tsv").write_text("code\n007\n")

        data_loader = DataLoader(tmp_path, schemaview=SchemaView(str(multi_schema_file)))
        coded_row = next(data_loader["Coded"])
        numbered_row = next(data_loader["Numbered"])

        assert coded_row["code"] == "007"  # string range preserved
        assert numbered_row["code"] == 7  # integer range coerced
Loading