fc

praateekmahajan · praateekmahajan · commit dbf9a522afca · 2025-11-07T13:36:45.000-08:00
Signed-off-by: Praateek <praateekm@gmail.com>
diff --git a/nemo_curator/stages/deduplication/semantic/identify_duplicates.py b/nemo_curator/stages/deduplication/semantic/identify_duplicates.py
@@ -89,13 +89,21 @@ def process_batch(self, tasks: list[FileGroupTask]) -> list[FileGroupTask]:
 
         all_files = [file for task in tasks for file in task.data]
         # Read using filters
-        df: pd.DataFrame = pd.read_parquet(
-            all_files,
-            storage_options=self.input_storage_options,
-            **self.read_kwargs,
-            filters=[("cosine_sim_score", ">=", 1.0 - self.eps)],
-            engine="pyarrow",
-        )[["id"]]  # TODO: If we want we can add other columns
+
+        df: pd.DataFrame = pd.concat(
+            [
+                pd.read_parquet(
+                    f,
+                    storage_options=self.input_storage_options,
+                    **self.read_kwargs,
+                    filters=[("cosine_sim_score", ">=", 1.0 - self.eps)],
+                    columns=["id"],
+                    engine="pyarrow",
+                )
+                for f in all_files
+            ],
+            ignore_index=True,
+        )
         # Write out sorted and with multiple row groups
         df.sort_values("id", inplace=True)  # noqa: PD002
 
diff --git a/nemo_curator/stages/deduplication/semantic/pairwise_io.py b/nemo_curator/stages/deduplication/semantic/pairwise_io.py
@@ -21,6 +21,7 @@
 from nemo_curator.stages.base import ProcessingStage
 from nemo_curator.stages.resources import Resources
 from nemo_curator.tasks import FileGroupTask, _EmptyTask
+from nemo_curator.utils.client_utils import is_remote_url
 from nemo_curator.utils.file_utils import get_all_file_paths_under, get_fs, infer_dataset_name_from_path
 
 if TYPE_CHECKING:
@@ -52,6 +53,7 @@ def __init__(
         self._name = "pairwise_file_partitioning"
         self._resources = Resources(cpus=0.5)
         self.fs: AbstractFileSystem | None = None
+        self.path_normalizer = lambda x: x
 
     def inputs(self) -> tuple[list[str], list[str]]:
         return ["data"], []
@@ -61,6 +63,7 @@ def outputs(self) -> tuple[list[str], list[str]]:
 
     def setup(self, _: WorkerMetadata | None = None) -> None:
         self.fs = get_fs(self.input_path, storage_options=self.storage_options)
+        self.path_normalizer = self.fs.unstrip_protocol if is_remote_url(self.input_path) else (lambda x: x)
 
     def ray_stage_spec(self) -> dict[str, Any]:
         """Ray stage specification for this stage."""
@@ -83,7 +86,7 @@ def process(self, _: _EmptyTask) -> list[FileGroupTask]:
             # Extract centroid ID from directory name (e.g., "centroid=0" -> 0)
             if "centroid=" in entry:
                 centroid_id = int(entry.split("centroid=")[-1])
-                centroid_dirs[centroid_id] = entry
+                centroid_dirs[centroid_id] = self.path_normalizer(entry)
 
         logger.debug(
             f"Found {len(centroid_dirs)} centroid directories e.g. {next(iter(centroid_dirs.values())) if centroid_dirs else None}"
diff --git a/nemo_curator/stages/text/deduplication/removal.py b/nemo_curator/stages/text/deduplication/removal.py
@@ -31,6 +31,7 @@
 from nemo_curator.stages.base import ProcessingStage
 from nemo_curator.stages.deduplication.id_generator import CURATOR_DEDUP_ID_STR
 from nemo_curator.tasks import DocumentBatch
+from nemo_curator.utils.file_utils import get_fs
 
 
 @dataclass
@@ -57,6 +58,8 @@ def __post_init__(self):
         super().__init__()
         self._name = "DuplicatesRemovalStage"
         self.read_kwargs = self.read_kwargs.copy() if self.read_kwargs else {}
+        # TODO: self.fs may be unnecessary here; confirm nothing uses it and remove it
+        self.fs = get_fs(self.ids_to_remove_path, storage_options=self.read_kwargs.get("storage_options", {}))
 
     def process(self, task: DocumentBatch) -> DocumentBatch:
         """
@@ -72,17 +75,21 @@ def process(self, task: DocumentBatch) -> DocumentBatch:
         input_df_min_max_time = time.perf_counter() - input_df_t0
         # Filter the parquet files for IDs to remove within this range
         read_dupes_t0 = time.perf_counter()
-        removal_df = pd.read_parquet(
+
+        # NOTE: ids_to_remove_path is a directory, and pd.read_parquet might error out when
+        # that directory is a cloud path; pq.read_table is the alternative considered here
+        removal_table = pd.read_parquet(
             self.ids_to_remove_path,
             filters=[(self.duplicate_id_field, ">=", min_id), (self.duplicate_id_field, "<=", max_id)],
             columns=[self.duplicate_id_field],
-            **self.read_kwargs,
+            **self.read_kwargs,  # this might fail if filesystem exists in read_kwargs
         )
+
         read_dupes_time = time.perf_counter() - read_dupes_t0
 
         # Filter out documents with IDs in the removal set using pandas
         time_to_remove_t0 = time.perf_counter()
-        removal_ids = set(removal_df[self.duplicate_id_field].tolist())
+        removal_ids = set(removal_table[self.duplicate_id_field].tolist())
         df = df[~df[self.id_field].isin(removal_ids)]
         removal_ids_time = time.perf_counter() - time_to_remove_t0
         self._log_metrics(
diff --git a/nemo_curator/stages/text/io/reader/parquet.py b/nemo_curator/stages/text/io/reader/parquet.py
@@ -20,7 +20,7 @@
 from nemo_curator.stages.base import CompositeStage
 from nemo_curator.stages.file_partitioning import FilePartitioningStage
 from nemo_curator.tasks import DocumentBatch, _EmptyTask
-from nemo_curator.utils.file_utils import FILETYPE_TO_DEFAULT_EXTENSIONS
+from nemo_curator.utils.file_utils import FILETYPE_TO_DEFAULT_EXTENSIONS, get_fs
 
 from .base import BaseReader
 
@@ -59,7 +59,12 @@ def read_data(
         if "dtype_backend" not in read_kwargs:
             update_kwargs["dtype_backend"] = "pyarrow"
         read_kwargs.update(update_kwargs)
-        return pd.read_parquet(paths, **read_kwargs)
+
+        # TODO generating filesystem for each task will be inefficient, we should benchmark pq.read_table  # noqa: TD004
+        fs = get_fs(paths[0], storage_options=read_kwargs.get("storage_options", {}))
+        # pop storage_options from read_kwargs
+        read_kwargs.pop("storage_options", None)
+        return pd.read_parquet(paths, filesystem=fs, **read_kwargs)
 
 
 @dataclass
diff --git a/nemo_curator/stages/text/io/writer/base.py b/nemo_curator/stages/text/io/writer/base.py
@@ -17,13 +17,13 @@
 from dataclasses import dataclass, field
 from typing import Any, Literal
 
-import fsspec
-from fsspec.utils import infer_storage_options
+from fsspec.core import url_to_fs
 from loguru import logger
 
 import nemo_curator.stages.text.io.writer.utils as writer_utils
 from nemo_curator.stages.base import ProcessingStage
 from nemo_curator.tasks import DocumentBatch, FileGroupTask
+from nemo_curator.utils.client_utils import is_remote_url
 from nemo_curator.utils.file_utils import check_output_mode
 
 
@@ -41,25 +41,16 @@ class BaseWriter(ProcessingStage[DocumentBatch, FileGroupTask], ABC):
     fields: list[str] | None = None
     mode: Literal["ignore", "overwrite", "append", "error"] = "ignore"
     _name: str = "BaseWriter"
-    _fs_path: str = field(init=False, repr=False, default="")
-    _protocol: str = field(init=False, repr=False, default="file")
-    _has_explicit_protocol: bool = field(init=False, repr=False, default=False)
     append_mode_implemented: bool = False
 
     def __post_init__(self):
-        # Determine protocol and normalized filesystem path
-        path_opts = infer_storage_options(self.path)
-        protocol = path_opts.get("protocol", "file")
-        self._protocol = protocol or "file"
-        # Track if the user provided an explicit URL-style protocol in the path
-        self._has_explicit_protocol = "://" in self.path
-        # Use the filesystem-native path (no protocol) for fs operations
-        self._fs_path = path_opts.get("path", self.path)
-
-        # Only pass user-provided storage options to fsspec
+        # Use fsspec's url_to_fs to get both filesystem and normalized path
         self.storage_options = (self.write_kwargs or {}).get("storage_options", {})
-        self.fs = fsspec.filesystem(protocol, **self.storage_options)
+        self.fs, self._fs_path = url_to_fs(self.path, **self.storage_options)
         check_output_mode(self.mode, self.fs, self._fs_path, append_mode_implemented=self.append_mode_implemented)
+        logger.info(
+            f"Initialized writer for {self.path} with filesystem {self.fs} and storage_options {self.storage_options}"
+        )
 
     def inputs(self) -> tuple[list[str], list[str]]:
         return ["data"], []
@@ -95,17 +86,22 @@ def process(self, task: DocumentBatch) -> FileGroupTask:
         file_extension = self.get_file_extension()
         file_path = self.fs.sep.join([self._fs_path, f"{filename}.{file_extension}"])
 
+        # For remote URLs, restore the protocol prefix so downstream code can infer the filesystem
+        file_path_with_protocol = self.fs.unstrip_protocol(file_path) if is_remote_url(self.path) else file_path
+
+        logger.info(f"Writing {task.num_items} records to {file_path_with_protocol} with filesystem {self.fs}")
+
         if self.fs.exists(file_path):
-            logger.debug(f"File {file_path} already exists, overwriting it")
+            logger.debug(f"File {file_path_with_protocol} already exists, overwriting it")
 
-        self.write_data(task, file_path)
-        logger.debug(f"Written {task.num_items} records to {file_path}")
+        self.write_data(task, file_path_with_protocol)
+        logger.debug(f"Written {task.num_items} records to {file_path_with_protocol}")
 
-        # Create FileGroupTask with written files
+        # Create FileGroupTask with written files using the full protocol-prefixed path
         return FileGroupTask(
             task_id=task.task_id,
             dataset_name=task.dataset_name,
-            data=[file_path],
+            data=[file_path_with_protocol],
             _metadata={
                 **task._metadata,
                 "format": self.get_file_extension(),
diff --git a/nemo_curator/stages/text/io/writer/parquet.py b/nemo_curator/stages/text/io/writer/parquet.py
@@ -41,4 +41,6 @@ def write_data(self, task: DocumentBatch, file_path: str) -> None:
 
         # Add any additional kwargs, allowing them to override defaults
         write_kwargs.update(self.write_kwargs)
-        df.to_parquet(file_path, **write_kwargs)
+        # Pop storage_options as we're directly passing the filesystem to the writer
+        write_kwargs.pop("storage_options", None)
+        df.to_parquet(file_path, filesystem=self.fs, **write_kwargs)