diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9a54a41 --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +BSD 3-Clause License + +Copyright (c) 2023, Oasis Loss Modelling Framework + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/docker-compose.yml b/docker-compose.yml index 0bd47d4..7ad02c3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ version: "3.8" services: localstack: container_name: "${LOCALSTACK_DOCKER_NAME-localstack_main}" - image: localstack/localstack + image: localstack/localstack:3 ports: - "127.0.0.1:4566:4566" # LocalStack Gateway - "127.0.0.1:4510-4559:4510-4559" # external services port range diff --git a/oasis_data_manager/__init__.py b/oasis_data_manager/__init__.py index fc79d63..020ed73 100644 --- a/oasis_data_manager/__init__.py +++ b/oasis_data_manager/__init__.py @@ -1 +1 @@ -__version__ = '0.2.1' +__version__ = '0.2.2' diff --git a/oasis_data_manager/filestore/backends/aws_s3.py b/oasis_data_manager/filestore/backends/aws_s3.py index 55c873f..25b2985 100755 --- a/oasis_data_manager/filestore/backends/aws_s3.py +++ b/oasis_data_manager/filestore/backends/aws_s3.py @@ -256,10 +256,18 @@ def url(self, object_name, parameters=None, expire=None): else: return self._strip_signing_parameters(url) + def put(self, reference, filename=None, subdir="", suffix=None, arcname=None): + # Prepend location so the returned key is bucket-relative (location/subdir/file). + # The server's CopyObject and is_in_bucket() both use the raw bucket key, so the + # returned value must include the location prefix, not be relative to it. + effective_subdir = os.path.join(self.location, subdir) if self.location else subdir + return super().put(reference, filename=filename, subdir=effective_subdir, suffix=suffix, arcname=arcname) + def get_storage_url(self, filename=None, suffix="tar.gz", encode_params=True): filename = ( filename if filename is not None else self._get_unique_filename(suffix) ) + key = os.path.join(self.location, filename) if self.location else filename params = {} if encode_params: @@ -279,8 +287,8 @@ def get_storage_url(self, filename=None, suffix="tar.gz", encode_params=True): params["endpoint"] = self.endpoint_url return ( - filename, - f"s3://{os.path.join(self.root_dir, filename)}{'?' if params else ''}{parse.urlencode(params) if params else ''}", + key, + f"s3://{os.path.join(self.root_dir, key)}{'?' if params else ''}{parse.urlencode(params) if params else ''}", ) @contextlib.contextmanager diff --git a/setup.py b/setup.py index c796eb6..d2aa201 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,8 @@ def get_optional_requirements(): exclude_package_data={ "": ["__pycache__", "*.py[co]"], }, + license="BSD-3-Clause", + license_files=["LICENSE"], description="", long_description="", long_description_content_type="text/markdown", diff --git a/tests/df_reader/test_geo.py b/tests/df_reader/test_geo.py index fbe1388..56f4ab4 100644 --- a/tests/df_reader/test_geo.py +++ b/tests/df_reader/test_geo.py @@ -1,9 +1,10 @@ +import zipfile from tempfile import NamedTemporaryFile -import geodatasets import geopandas as gpd import pandas as pd import pytest +import requests # type: ignore[import-untyped] from shapely.geometry import Point from oasis_data_manager.df_reader.reader import ( @@ -14,6 +15,8 @@ ) from oasis_data_manager.filestore.backends.local import LocalStorage +NYBB_URL = "https://raw.githubusercontent.com/geopandas/geodatasets/main/data_backup/nybb_16a.zip" + READERS = [ OasisPandasReaderCSV, OasisPandasReaderParquet, @@ -25,14 +28,26 @@ storage = LocalStorage("/") +@pytest.fixture(scope="session") +def nybb_path(tmp_path_factory): + tmp = tmp_path_factory.mktemp("nybb") + zip_dest = tmp / "nybb_16a.zip" + r = requests.get(NYBB_URL, timeout=30) + r.raise_for_status() + zip_dest.write_bytes(r.content) + with zipfile.ZipFile(zip_dest) as zf: + zf.extractall(tmp) + return str(next(tmp.rglob("*.shp"))) + + @pytest.fixture -def df(): +def df(nybb_path): """ df representing data with a long/lat, here we pull some values from within the nybb dataset used in tests to create some values in out expected boroughs. """ - b = [int(x) for x in gpd.read_file(geodatasets.get_path("nybb")).total_bounds] + b = [int(x) for x in gpd.read_file(nybb_path).total_bounds] return pd.DataFrame( [ @@ -46,7 +61,7 @@ def df(): @pytest.mark.parametrize("reader", READERS) -def test_read__expected_pandas_dataframe(reader, df): +def test_read__expected_pandas_dataframe(reader, df, nybb_path): suffix = ".csv" if "csv" in reader.__name__.lower() else ".parquet" with NamedTemporaryFile(suffix=suffix) as file: if suffix == ".csv": @@ -56,7 +71,7 @@ def test_read__expected_pandas_dataframe(reader, df): result = ( reader(file.name, storage) - .apply_geo(geodatasets.get_path("nybb")) + .apply_geo(nybb_path) .as_pandas() ) @@ -70,7 +85,7 @@ def test_read__expected_pandas_dataframe(reader, df): @pytest.mark.parametrize("reader", READERS) -def test_read__expected_pandas_dataframe__drop_geo(reader, df): +def test_read__expected_pandas_dataframe__drop_geo(reader, df, nybb_path): suffix = ".csv" if "csv" in reader.__name__.lower() else ".parquet" with NamedTemporaryFile(suffix=suffix) as file: if suffix == ".csv": @@ -83,7 +98,7 @@ def test_read__expected_pandas_dataframe__drop_geo(reader, df): file.name, storage, ) - .apply_geo(geodatasets.get_path("nybb"), drop_geo=False) + .apply_geo(nybb_path, drop_geo=False) .as_pandas() ) @@ -123,7 +138,7 @@ def test_read__expected_pandas_dataframe__drop_geo(reader, df): @pytest.mark.parametrize("reader", READERS) -def test_read__gql_filter__expected_pandas_dataframe(reader, df): +def test_read__gql_filter__expected_pandas_dataframe(reader, df, nybb_path): suffix = ".csv" if "csv" in reader.__name__.lower() else ".parquet" with NamedTemporaryFile(suffix=suffix) as file: if suffix == ".csv": @@ -133,7 +148,7 @@ def test_read__gql_filter__expected_pandas_dataframe(reader, df): result = ( reader(file.name, storage) - .apply_geo(geodatasets.get_path("nybb")) + .apply_geo(nybb_path) .filter([lambda x: x[x["longitude"] == 1028825]]) .as_pandas() ) @@ -148,7 +163,7 @@ def test_read__gql_filter__expected_pandas_dataframe(reader, df): @pytest.mark.parametrize("reader", READERS) -def test_read__shape_file__invalid(reader, df, caplog): +def test_read__shape_file__invalid(reader, df, nybb_path, caplog): df = df.rename(columns={"longitude": "lon", "latitude": "lat"}) suffix = ".csv" if "csv" in reader.__name__.lower() else ".parquet" @@ -163,7 +178,7 @@ def test_read__shape_file__invalid(reader, df, caplog): file.name, storage, ) - .apply_geo(geodatasets.get_path("nybb")) + .apply_geo(nybb_path) .as_pandas() ) diff --git a/tests/filestorage/test_aws.py b/tests/filestorage/test_aws.py index b6fc044..ba15cb0 100644 --- a/tests/filestorage/test_aws.py +++ b/tests/filestorage/test_aws.py @@ -41,6 +41,93 @@ def test_storage_constructed_from_config_matches_initial(): assert result.bucket_name == storage.bucket_name +def test_location_does_not_affect_root_dir(): + # root_dir is bucket-scoped only; location is applied at put/get_storage_url time + storage = make_storage(location="oasis") + + assert storage.root_dir == storage.bucket_name + + +def test_location_round_trips_through_config(): + storage = make_storage(location="oasis", root_dir="subdir") + + result = get_storage_from_config(storage.to_config()) + + assert isinstance(result, AwsS3Storage) + assert result.root_dir == storage.root_dir + assert result.location == storage.location + + +def test_location_put_returns_bucket_relative_key(): + # The returned key must be the raw S3 key (bucket-relative) so the server's + # is_in_bucket() lookup and CopyObject both resolve correctly. + storage = make_storage(location="oasis") + + with tempfile.NamedTemporaryFile("w", suffix=".txt") as f: + f.write("content") + f.flush() + + stored_path = storage.put(f.name, "test_file.txt") + + assert stored_path == os.path.join("oasis", "test_file.txt") + # The file must actually live at that bucket-relative key + assert storage.fs.isfile(stored_path) + + +def test_location_put_with_subdir(): + storage = make_storage(location="oasis") + + with tempfile.NamedTemporaryFile("w", suffix=".txt") as f: + f.write("content") + f.flush() + + stored_path = storage.put(f.name, "test_file.txt", subdir="run1") + + assert stored_path == os.path.join("oasis", "run1", "test_file.txt") + assert storage.fs.isfile(stored_path) + + +def test_no_location_put_returns_bare_key(): + # Without location, behaviour is unchanged: returned key has no prefix + storage = make_storage() + + with tempfile.NamedTemporaryFile("w", suffix=".txt") as f: + f.write("content") + f.flush() + + stored_path = storage.put(f.name, "test_file.txt") + + assert stored_path == "test_file.txt" + assert storage.fs.isfile(stored_path) + + +def test_location_get_storage_url_includes_prefix(): + storage = make_storage(location="oasis") + + key, url = storage.get_storage_url(filename="myfile.tar.gz") + + assert key == "oasis/myfile.tar.gz" + assert "oasis/myfile.tar.gz" in url + + +def test_location_put_then_get_roundtrip(): + # Worker puts a file, then retrieves it using the returned key — must work + storage = make_storage(location="oasis") + + with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as src: + src.write("hello") + src.flush() + stored_path = storage.put(src.name, "roundtrip.txt") + + with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as dest: + dest_path = dest.name + + result = storage.get(stored_path, dest_path) + assert result == dest_path + with open(dest_path) as f: + assert f.read() == "hello" + + @pytest.mark.parametrize("acl", ["public-read-write", "public-read"]) def test_uploaded_file_has_the_correct_acl(acl): storage = make_storage(default_acl=acl)