|
24 | 24 | import shutil |
25 | 25 | import time |
26 | 26 | from concurrent.futures import ThreadPoolExecutor, wait |
27 | | -from typing import Literal, Optional, Union |
| 27 | +from typing import List, Literal, Optional, Type, Union |
28 | 28 |
|
29 | 29 | from hopsworks_common import client, tag, usage, util |
30 | 30 | from hopsworks_common.client.exceptions import DatasetException, RestAPIError |
31 | | -from hopsworks_common.core import inode |
| 31 | +from hopsworks_common.core import dataset, inode |
32 | 32 | from tqdm.auto import tqdm |
33 | 33 |
|
34 | 34 |
|
@@ -638,68 +638,80 @@ def upload_feature_group(self, feature_group, path, dataframe): |
638 | 638 | chunk_number += 1 |
639 | 639 |
|
640 | 640 | @usage.method_logger |
641 | | - def list_files(self, path: str, offset: int, limit: int): |
642 | | - """**Deprecated** |
| 641 | + def list(self, path: str, offset: int = 0, limit: int = 1000) -> List[str]: |
| 642 | + """List the files and directories from a path in the Hopsworks Filesystem. |
| 643 | +
|
| 644 | + ```python |
| 645 | +
|
| 646 | + import hopsworks |
| 647 | +
|
| 648 | + project = hopsworks.login() |
| 649 | +
|
| 650 | + dataset_api = project.get_dataset_api() |
| 651 | +
|
| 652 | + # list all files in the Resources dataset |
| 653 | + files = dataset_api.list("/Resources") |
| 654 | +
|
| 655 | + # list all datasets in the project |
| 656 | + files = dataset_api.list("/") |
| 657 | +
|
| 658 | + ``` |
| 659 | + # Arguments |
| 660 | + path: path in Hopsworks filesystem to the directory |
| 661 | + offset: the number of entities to skip |
| 662 | + limit: max number of the returned entities |
| 663 | + # Returns |
| 664 | + `list[str]`: List of paths to the files and directories in the provided path |
| 665 | + # Raises |
| 666 | + `hopsworks.client.exceptions.RestAPIError`: If the backend encounters an error when handling the request |
| 667 | + """ |
| 668 | + _client = client.get_instance() |
| 669 | + # Normalize path so we can check if the path refers to the root or not |
| 670 | + # That is needed as different backend entities are returned depending on if it is a top level dataset or a subdirectory |
| 671 | + normalized_path = os.path.normpath(path) |
| 672 | + if normalized_path == "/": |
| 673 | + normalized_path = "" |
| 674 | + cls = dataset.Dataset |
| 675 | + else: |
| 676 | + cls = inode.Inode |
| 677 | + |
| 678 | + count, items = self._list_dataset_path(normalized_path, cls, offset=offset, limit=limit) |
643 | 679 |
|
| 680 | + files = [] |
| 681 | + for item in items: |
| 682 | + files.append(util.convert_to_project_rel_path(item.path, _client._project_name)) |
| 683 | + return files |
| 684 | + |
| 685 | + @usage.method_logger |
| 686 | + def _list_dataset_path(self, path: str, cls: Union[Type[dataset.Dataset], Type[inode.Inode]], offset: int = 0, limit: int = 1000, sort_by: str = "ID:asc"): |
| 687 | + """ |
644 | 688 | List contents of a directory in the Hopsworks Filesystem. |
645 | 689 |
|
646 | 690 | # Arguments |
647 | 691 | path: path to the directory to list the contents of. |
648 | 692 | offset: the number of Inodes to skip. |
649 | 693 | limit: max number of the returned Inodes. |
650 | 694 | # Returns |
651 | | - `tuple[int, list[hopsworks.core.inode.Inode]]`: count of Inodes in the directory and the list of them. |
| 695 | + `tuple[int, list[inode.Inode]] | tuple[int, list[dataset.Dataset]]`: count of Datasets or Inodes in the path and the list of them |
652 | 696 | """ |
653 | 697 | _client = client.get_instance() |
654 | 698 | path_params = [ |
655 | 699 | "project", |
656 | 700 | _client._project_id, |
657 | 701 | "dataset", |
658 | | - path[(path.index("/", 10) + 1) :], |
| 702 | + path, |
659 | 703 | ] |
660 | 704 | query_params = { |
661 | 705 | "action": "listing", |
662 | 706 | "offset": offset, |
663 | 707 | "limit": limit, |
664 | | - "sort_by": "ID:asc", |
| 708 | + "sort_by": sort_by, |
| 709 | + "expand": "inodes", |
665 | 710 | } |
666 | 711 |
|
667 | | - inode_lst = _client._send_request("GET", path_params, query_params) |
668 | | - |
669 | | - return inode_lst["count"], inode.Inode.from_response_json(inode_lst) |
670 | | - |
671 | | - @usage.method_logger |
672 | | - def list( |
673 | | - self, |
674 | | - remote_path: str, |
675 | | - sort_by: str | None = None, |
676 | | - offset: int = 0, |
677 | | - limit: int = 1000, |
678 | | - ): |
679 | | - """**Deprecated** |
680 | | -
|
681 | | - List contents of a directory in the Hopsworks Filesystem. |
| 712 | + items = _client._send_request("GET", path_params, query_params) |
682 | 713 |
|
683 | | - # Arguments |
684 | | - remote_path: path to the directory to list the contents of. |
685 | | - sort_by: sort string, for example `"ID:asc"`. |
686 | | - offset: the number of entities to skip. |
687 | | - limit: max number of the returned entities. |
688 | | - """ |
689 | | - # this method is probably to be merged with list_files |
690 | | - # they seem to handle paths differently and return different results, which prevents the merge at the moment (2024-09-03), due to the requirement of backwards-compatibility |
691 | | - _client = client.get_instance() |
692 | | - path_params = ["project", _client._project_id, "dataset", remote_path] |
693 | | - query_params = { |
694 | | - "action": "listing", |
695 | | - "sort_by": sort_by, |
696 | | - "limit": limit, |
697 | | - "offset": offset, |
698 | | - } |
699 | | - headers = {"content-type": "application/json"} |
700 | | - return _client._send_request( |
701 | | - "GET", path_params, headers=headers, query_params=query_params |
702 | | - ) |
| 714 | + return items["count"], cls.from_response_json(items) |
703 | 715 |
|
704 | 716 | @usage.method_logger |
705 | 717 | def read_content(self, path: str, dataset_type: str = "DATASET"): |
|
0 commit comments