Skip to content

Commit df14096

Browse files
authored
[HWORKS-2317] Add dataset api list function (#643)
1 parent 023318e commit df14096

File tree

19 files changed

+215
-103
lines changed

19 files changed

+215
-103
lines changed

java/beam/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<artifactId>hsfs-parent</artifactId>
77
<groupId>com.logicalclocks</groupId>
8-
<version>4.5.0-RC5</version>
8+
<version>4.5.0-RC6</version>
99
</parent>
1010
<modelVersion>4.0.0</modelVersion>
1111

java/flink/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<artifactId>hsfs-parent</artifactId>
77
<groupId>com.logicalclocks</groupId>
8-
<version>4.5.0-RC5</version>
8+
<version>4.5.0-RC6</version>
99
</parent>
1010
<modelVersion>4.0.0</modelVersion>
1111

java/hsfs/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<artifactId>hsfs-parent</artifactId>
77
<groupId>com.logicalclocks</groupId>
8-
<version>4.5.0-RC5</version>
8+
<version>4.5.0-RC6</version>
99
</parent>
1010
<modelVersion>4.0.0</modelVersion>
1111

java/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
<groupId>com.logicalclocks</groupId>
88
<artifactId>hsfs-parent</artifactId>
99
<packaging>pom</packaging>
10-
<version>4.5.0-RC5</version>
10+
<version>4.5.0-RC6</version>
1111
<modules>
1212
<module>hsfs</module>
1313
<module>spark</module>

java/spark/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
<parent>
2323
<artifactId>hsfs-parent</artifactId>
2424
<groupId>com.logicalclocks</groupId>
25-
<version>4.5.0-RC5</version>
25+
<version>4.5.0-RC6</version>
2626
</parent>
2727
<modelVersion>4.0.0</modelVersion>
2828

python/auto_doc.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,6 @@
152152
"delete",
153153
"path_exists",
154154
"rm",
155-
"list",
156-
"list_files",
157155
],
158156
),
159157
},
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#
2+
# Copyright 2025 Hopsworks AB
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
from __future__ import annotations
18+
19+
from typing import Any, Dict, List
20+
21+
import humps
22+
23+
24+
class Dataset:
25+
def __init__(
26+
self,
27+
id,
28+
name,
29+
dataset_type,
30+
attributes: Dict[str, Any],
31+
**kwargs,
32+
) -> None:
33+
self._id = id
34+
self._name = name
35+
self._description = kwargs.get("description", None)
36+
self._dataset_type = dataset_type
37+
self._path = attributes["path"]
38+
39+
@classmethod
40+
def from_response_json(cls, json_dict: Dict[str, Any]) -> List[Dataset]:
41+
json_decamelized = humps.decamelize(json_dict)["items"]
42+
for dataset in json_decamelized:
43+
_ = dataset.pop("type", None)
44+
return [cls(**dataset) for dataset in json_decamelized]
45+
@property
46+
def id(self) -> str:
47+
return self._id
48+
49+
@property
50+
def name(self) -> str:
51+
return self._name
52+
53+
@property
54+
def description(self) -> str:
55+
return self._description
56+
57+
@property
58+
def dataset_type(self) -> str:
59+
return self._dataset_type
60+
61+
@property
62+
def path(self) -> str:
63+
return self._path

python/hopsworks_common/core/dataset_api.py

Lines changed: 54 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@
2424
import shutil
2525
import time
2626
from concurrent.futures import ThreadPoolExecutor, wait
27-
from typing import Literal, Optional, Union
27+
from typing import List, Literal, Optional, Type, Union
2828

2929
from hopsworks_common import client, tag, usage, util
3030
from hopsworks_common.client.exceptions import DatasetException, RestAPIError
31-
from hopsworks_common.core import inode
31+
from hopsworks_common.core import dataset, inode
3232
from tqdm.auto import tqdm
3333

3434

@@ -638,68 +638,80 @@ def upload_feature_group(self, feature_group, path, dataframe):
638638
chunk_number += 1
639639

640640
@usage.method_logger
641-
def list_files(self, path: str, offset: int, limit: int):
642-
"""**Deprecated**
641+
def list(self, path: str, offset: int = 0, limit: int = 1000) -> List[str]:
642+
"""List the files and directories from a path in the Hopsworks Filesystem.
643+
644+
```python
645+
646+
import hopsworks
647+
648+
project = hopsworks.login()
649+
650+
dataset_api = project.get_dataset_api()
651+
652+
# list all files in the Resources dataset
653+
files = dataset_api.list("/Resources")
654+
655+
# list all datasets in the project
656+
files = dataset_api.list("/")
657+
658+
```
659+
# Arguments
660+
path: path in Hopsworks filesystem to the directory
661+
offset: the number of entities to skip
662+
limit: max number of the returned entities
663+
# Returns
664+
`list[str]`: List of path to files and directories in the provided path
665+
# Raises
666+
`hopsworks.client.exceptions.RestAPIError`: If the backend encounters an error when handling the request
667+
"""
668+
_client = client.get_instance()
669+
# Normalize path so we can check if the path refers to the root or not
670+
# That is needed as different backend entities are returned depending on if it is a top level dataset or a subdirectory
671+
normalized_path = os.path.normpath(path)
672+
if normalized_path == "/":
673+
normalized_path = ""
674+
cls = dataset.Dataset
675+
else:
676+
cls = inode.Inode
677+
678+
count, items = self._list_dataset_path(normalized_path, cls, offset=offset, limit=limit)
643679

680+
files = []
681+
for item in items:
682+
files.append(util.convert_to_project_rel_path(item.path, _client._project_name))
683+
return files
684+
685+
@usage.method_logger
686+
def _list_dataset_path(self, path: str, cls: Union[Type[dataset.Dataset], Type[inode.Inode]], offset: int = 0, limit: int = 1000, sort_by: str = "ID:asc"):
687+
"""
644688
List contents of a directory in the Hopsworks Filesystem.
645689
646690
# Arguments
647691
path: path to the directory to list the contents of.
648692
offset: the number of Inodes to skip.
649693
limit: max number of the returned Inodes.
650694
# Returns
651-
`tuple[int, list[hopsworks.core.inode.Inode]]`: count of Inodes in the directory and the list of them.
695+
`tuple[int, tuple[int, list[inode.Inode]] | tuple[int, list[Dataset]]]`: count of Dataset or Inodes and objects
652696
"""
653697
_client = client.get_instance()
654698
path_params = [
655699
"project",
656700
_client._project_id,
657701
"dataset",
658-
path[(path.index("/", 10) + 1) :],
702+
path,
659703
]
660704
query_params = {
661705
"action": "listing",
662706
"offset": offset,
663707
"limit": limit,
664-
"sort_by": "ID:asc",
708+
"sort_by": sort_by,
709+
"expand": "inodes",
665710
}
666711

667-
inode_lst = _client._send_request("GET", path_params, query_params)
668-
669-
return inode_lst["count"], inode.Inode.from_response_json(inode_lst)
670-
671-
@usage.method_logger
672-
def list(
673-
self,
674-
remote_path: str,
675-
sort_by: str | None = None,
676-
offset: int = 0,
677-
limit: int = 1000,
678-
):
679-
"""**Deprecated**
680-
681-
List contents of a directory in the Hopsworks Filesystem.
712+
items = _client._send_request("GET", path_params, query_params)
682713

683-
# Arguments
684-
remote_path: path to the directory to list the contents of.
685-
sort_by: sort string, for example `"ID:asc"`.
686-
offset: the number of entities to skip.
687-
limit: max number of the returned entities.
688-
"""
689-
# this method is probably to be merged with list_files
690-
# they seem to handle paths differently and return different results, which prevents the merge at the moment (2024-09-03), due to the requirement of backwards-compatibility
691-
_client = client.get_instance()
692-
path_params = ["project", _client._project_id, "dataset", remote_path]
693-
query_params = {
694-
"action": "listing",
695-
"sort_by": sort_by,
696-
"limit": limit,
697-
"offset": offset,
698-
}
699-
headers = {"content-type": "application/json"}
700-
return _client._send_request(
701-
"GET", path_params, headers=headers, query_params=query_params
702-
)
714+
return items["count"], cls.from_response_json(items)
703715

704716
@usage.method_logger
705717
def read_content(self, path: str, dataset_type: str = "DATASET"):

python/hopsworks_common/core/inode.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,13 @@ def __init__(
2929
tags: Optional[Dict[str, Any]] = None,
3030
**kwargs,
3131
) -> None:
32-
self._path = attributes["path"]
32+
self._name = attributes.get("name", None)
33+
self._dir = attributes.get("dir", False)
34+
self._owner = attributes.get("owner", None)
35+
self._path = attributes.get("path", None)
36+
self._permission = attributes.get("permission", None)
37+
self._modification_time = attributes.get("modification_time", None)
38+
self._under_construction = attributes.get("under_construction", None)
3339

3440
@classmethod
3541
def from_response_json(cls, json_dict: Dict[str, Any]) -> List[Inode]:
@@ -38,6 +44,30 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> List[Inode]:
3844
_ = inode.pop("type", None)
3945
return [cls(**inode) for inode in json_decamelized]
4046

47+
@property
48+
def name(self) -> str:
49+
return self._name
50+
51+
@property
52+
def dir(self) -> str:
53+
return self._dir
54+
55+
@property
56+
def owner(self) -> str:
57+
return self._owner
58+
4159
@property
4260
def path(self) -> str:
4361
return self._path
62+
63+
@property
64+
def permission(self) -> str:
65+
return self._permission
66+
67+
@property
68+
def modification_time(self) -> str:
69+
return self._modification_time
70+
71+
@property
72+
def under_construction(self) -> str:
73+
return self._under_construction

python/hopsworks_common/util.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,14 @@ def convert_to_abs(path, current_proj_name):
438438
return path
439439

440440

441+
def convert_to_project_rel_path(path, current_proj_name):
442+
abs_project_prefix = "/Projects/{}".format(current_proj_name)
443+
if path.startswith(abs_project_prefix):
444+
return path.replace(abs_project_prefix, "")
445+
else:
446+
return path
447+
448+
441449
def validate_job_conf(config, project_name):
442450
# User is required to set the appPath programmatically after getting the configuration
443451
if (

0 commit comments

Comments
 (0)