Skip to content

Commit 5c2b2f6

Browse files
authored
reconcile refactor (#20)
* reconcile refactor * PR updates * Update cli.py
1 parent 4aa5182 commit 5c2b2f6

File tree

7 files changed

+94
-255
lines changed

7 files changed

+94
-255
lines changed

dsaps/cli.py

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@
1414
logger = structlog.get_logger()
1515

1616

17+
def validate_path(ctx, param, value):
    """Validates the formatting of the submitted path.

    Click option callback: accepts the value only when it ends with a
    trailing slash (downstream code builds file paths by plain string
    concatenation, so the slash is required), otherwise raises a
    click.BadParameter error so the user is told how to correct it.
    """
    # endswith() is safe for the empty string, unlike value[-1],
    # which would raise IndexError.
    if value.endswith('/'):
        return value
    else:
        raise click.BadParameter('Include / at the end of the path.')
23+
24+
1725
@click.group(chain=True)
1826
@click.option('--url', envvar='DSPACE_URL', required=True,)
1927
@click.option('-e', '--email', envvar='TEST_EMAIL', required=True,
@@ -51,11 +59,13 @@ def main(ctx, url, email, password):
5159

5260
@main.command()
5361
@click.option('-m', '--metadata-csv', required=True,
54-
type=click.Path(exists=True),
55-
help='The full path to the CSV file of metadata for the items.')
56-
@click.option('--field-map', required=True, type=click.Path(exists=True),
57-
help='Path to JSON field mapping file')
58-
@click.option('-d', '--directory', required=True,
62+
type=click.Path(exists=True, file_okay=True, dir_okay=False),
63+
help='The path to the CSV file of metadata for the items.')
64+
@click.option('--field-map', required=True,
65+
type=click.Path(exists=True, file_okay=True, dir_okay=False),
66+
help='The path to JSON field mapping file.')
67+
@click.option('-d', '--content-directory', required=True,
68+
type=click.Path(exists=True, dir_okay=True, file_okay=False),
5969
help='The full path to the content, either a directory of files '
6070
'or a URL for the storage location.')
6171
@click.option('-t', '--file-type',
@@ -67,11 +77,11 @@ def main(ctx, url, email, password):
6777
help='The handle of the collection to which items are being '
6878
'added.', default=None)
6979
@click.pass_context
70-
def additems(ctx, metadata_csv, field_map, directory, file_type, ingest_report,
71-
collection_handle):
80+
def additems(ctx, metadata_csv, field_map, content_directory, file_type,
81+
ingest_report, collection_handle):
7282
"""Adds items to a specified collection from a metadata CSV, a field
7383
mapping file, and a directory of files. May be run in conjunction with the
74-
newcollection CLI commands."""
84+
newcollection CLI command."""
7585
client = ctx.obj['client']
7686
start_time = ctx.obj['start_time']
7787
if 'collection_uuid' not in ctx.obj and collection_handle is None:
@@ -87,7 +97,7 @@ def additems(ctx, metadata_csv, field_map, directory, file_type, ingest_report,
8797
mapping = json.load(jsonfile)
8898
collection = Collection.from_csv(metadata, mapping)
8999
for item in collection.items:
90-
item.bitstreams_from_directory(directory, file_type)
100+
item.bitstreams_from_directory(content_directory, file_type)
91101
collection.uuid = collection_uuid
92102
items = collection.post_items(client)
93103
if ingest_report:
@@ -114,20 +124,38 @@ def newcollection(ctx, community_handle, collection_name):
114124
ctx.obj['collection_uuid'] = collection_uuid
115125

116126

117-
# @main.command()
118-
# @click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
119-
# help='The path of the CSV file of metadata.')
120-
# @click.option('-o', '--output_path', prompt='Enter the output path',
121-
# default='', help='The path of the output files, include '
122-
# '/ at the end of the path')
123-
# @click.option('-f', '--file_path', prompt='Enter the path',
124-
# help='The path of the content, a URL or local drive path.'
125-
# 'Include / at the end of a local drive path.')
126-
# @click.option('-t', '--file_type', prompt='Enter the file type',
127-
# help='The file type to be uploaded.')
128-
# def reconcile(metadata_csv, file_path, file_type, output_path):
129-
# workflows.reconcile_files_and_metadata(metadata_csv, output_path,
130-
# file_path, file_type)
127+
@main.command()
@click.option('-m', '--metadata-csv', required=True,
              type=click.Path(exists=True, file_okay=True, dir_okay=False),
              help='The path of the CSV file of metadata.')
@click.option('-o', '--output-directory',
              type=click.Path(exists=True, file_okay=False),
              default=f'{os.getcwd()}/', callback=validate_path,
              help='The path of the output files, include / at the end of the '
              'path.')
@click.option('-d', '--content-directory', required=True,
              help='The full path to the content, either a directory of files '
              'or a URL for the storage location.')
@click.option('-t', '--file-type',
              help='The file type to be uploaded, if limited to one file '
              'type.', default='*')
def reconcile(metadata_csv, output_directory, content_directory, file_type):
    """Runs a reconciliation of the specified files and metadata that produces
    reports of files with no metadata, metadata with no files, metadata
    matched to files, and an updated version of the metadata CSV with only
    the records that have matching files."""
    # Collect the two ID populations to be compared.
    found_files = helpers.create_file_list(content_directory, file_type)
    csv_ids = helpers.create_metadata_id_list(metadata_csv)
    # Cross-match in both directions.
    matched_metadata = helpers.match_metadata_to_files(found_files, csv_ids)
    matched_files = helpers.match_files_to_metadata(found_files, csv_ids)
    # Whatever did not match on either side becomes its own report.
    missing_files = set(csv_ids) - set(matched_metadata)
    orphan_files = set(found_files) - set(matched_files)
    # output_directory ends with '/' (enforced by validate_path), so plain
    # concatenation yields valid paths.
    helpers.create_csv_from_list(orphan_files, f'{output_directory}no_metadata')
    helpers.create_csv_from_list(missing_files, f'{output_directory}no_files')
    helpers.create_csv_from_list(matched_metadata,
                                 f'{output_directory}metadata_matches')
    helpers.update_metadata_csv(metadata_csv, output_directory,
                                matched_metadata)
131159

132160

133161
if __name__ == '__main__':

dsaps/helpers.py

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,6 @@
22
import glob
33
import os
44

5-
import structlog
6-
7-
8-
logger = structlog.get_logger()
9-
105

116
def create_csv_from_list(list_name, output):
127
"""Creates CSV file from list content."""
@@ -17,14 +12,11 @@ def create_csv_from_list(list_name, output):
1712
writer.writerow([item])
1813

1914

20-
def create_file_dict(file_path, file_type):
21-
"""Creates a dict of file IDs and file paths."""
15+
def create_file_list(file_path, file_type):
    """Creates a list of file names.

    Recursively globs *file_path* for files with the *file_type*
    extension and returns their base names (no directory components).
    """
    pattern = f'{file_path}/**/*.{file_type}'
    matches = glob.glob(pattern, recursive=True)
    return [os.path.basename(match) for match in matches]
2820

2921

3022
def create_ingest_report(items, file_name):
@@ -43,37 +35,32 @@ def create_metadata_id_list(metadata_csv):
4335
metadata_ids = []
4436
with open(metadata_csv) as csvfile:
4537
reader = csv.DictReader(csvfile)
46-
for row in [r for r in reader if r['file_identifier'] != '']:
47-
metadata_ids.append(row['file_identifier'])
38+
metadata_ids = [row['file_identifier'] for row in reader
39+
if row['file_identifier'] != '']
4840
return metadata_ids
4941

5042

51-
def match_files_to_metadata(file_dict, metadata_ids):
43+
def match_files_to_metadata(file_list, metadata_ids):
    """Creates a list of files matched to metadata records.

    A file matches when its name starts with a metadata identifier.
    A file matching several identifiers appears once per match.
    """
    file_matches = []
    # Metadata IDs form the outer loop, mirroring the output ordering of
    # the equivalent nested comprehension.
    for metadata_id in metadata_ids:
        for file_id in file_list:
            if file_id.startswith(metadata_id):
                file_matches.append(file_id)
    return file_matches
5949

6050

61-
def match_metadata_to_files(file_dict, metadata_ids):
51+
def match_metadata_to_files(file_list, metadata_ids):
    """Creates a list of metadata records matched to files.

    A metadata identifier matches when some file name starts with it.
    An identifier matched by several files appears once per match.
    """
    metadata_matches = []
    # Files form the outer loop, mirroring the output ordering of the
    # equivalent nested comprehension.
    for file_id in file_list:
        for metadata_id in metadata_ids:
            if file_id.startswith(metadata_id):
                metadata_matches.append(metadata_id)
    return metadata_matches
6956

7057

71-
def update_metadata_csv(metadata_csv, output_path, metadata_matches):
58+
def update_metadata_csv(metadata_csv, output_directory, metadata_matches):
7259
"""Creates an updated CSV of metadata records with matching files."""
7360
with open(metadata_csv) as csvfile:
7461
reader = csv.DictReader(csvfile)
7562
upd_md_file_name = f'updated-{os.path.basename(metadata_csv)}'
76-
with open(f'{output_path}{upd_md_file_name}', 'w') as updated_csv:
63+
with open(f'{output_directory}{upd_md_file_name}', 'w') as updated_csv:
7764
writer = csv.DictWriter(updated_csv, fieldnames=reader.fieldnames)
7865
writer.writeheader()
7966
for row in reader:

dsaps/metadata.py

Lines changed: 0 additions & 78 deletions
This file was deleted.

tests/conftest.py

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pytest
66
import requests_mock
77

8-
from dsaps import metadata, models
8+
from dsaps import models
99

1010

1111
@pytest.fixture()
@@ -41,14 +41,6 @@ def aspace_delimited_csv():
4141
yield reader
4242

4343

44-
@pytest.fixture()
45-
def json_metadata_delim():
46-
json_metadata = metadata.create_json_metadata(
47-
'tests/fixtures/metadata_delim.csv', 'delimited'
48-
)
49-
return json_metadata
50-
51-
5244
@pytest.fixture()
5345
def aspace_mapping():
5446
with open('config/aspace_mapping.json') as f:
@@ -63,14 +55,6 @@ def standard_mapping():
6355
yield mapping
6456

6557

66-
@pytest.fixture()
67-
def json_metadata_num_col():
68-
json_metadata = metadata.create_json_metadata(
69-
'tests/fixtures/metadata_num_col.csv', 'num_columns'
70-
)
71-
return json_metadata
72-
73-
7458
@pytest.fixture()
7559
def output_dir(tmp_path):
7660
output_dir = tmp_path / 'output'
@@ -84,7 +68,7 @@ def runner():
8468

8569

8670
@pytest.fixture(autouse=True)
87-
def web_mock(input_dir):
71+
def web_mock():
8872
with requests_mock.Mocker() as m:
8973
cookies = {'JSESSIONID': '11111111'}
9074
m.post('mock://example.com/login', cookies=cookies)

tests/test_cli.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def test_additems(runner, input_dir):
1212
'tests/fixtures/metadata_delim.csv',
1313
'--field-map',
1414
'tests/fixtures/standard_mapping.json',
15-
'--directory', input_dir,
15+
'--content-directory', input_dir,
1616
'--file-type', 'pdf',
1717
'--collection-handle', '333.3333'])
1818
assert result.exit_code == 0
@@ -28,7 +28,7 @@ def test_additems(runner, input_dir):
2828
'tests/fixtures/metadata_delim.csv',
2929
'--field-map',
3030
'tests/fixtures/standard_mapping.json',
31-
'--directory', input_dir,
31+
'--content-directory', input_dir,
3232
'--file-type', 'pdf'])
3333
assert result.exit_code == 0
3434

@@ -45,16 +45,17 @@ def test_newcollection(runner, input_dir):
4545
assert result.exit_code == 0
4646

4747

48-
# def test_reconcile(runner, input_dir, output_dir):
49-
# """Test reconcile command."""
50-
# result = runner.invoke(main,
51-
# ['--url', 'mock://example.com/',
52-
# '--email', '[email protected]',
53-
# '--password', '1234',
54-
# 'reconcile',
55-
# '--metadata_csv', 'tests/fixtures/metadata_delim.csv',
56-
# '--file_path', input_dir,
57-
# '--file_type', 'pdf',
58-
# '--output_path', output_dir
59-
# ])
60-
# assert result.exit_code == 0
48+
def test_reconcile(runner, input_dir, output_dir):
    """Test reconcile command."""
    cli_args = [
        '--url', 'mock://example.com/',
        '--email', '[email protected]',
        '--password', '1234',
        'reconcile',
        '--metadata-csv', 'tests/fixtures/metadata_delim.csv',
        '--output-directory', output_dir,
        '--content-directory', input_dir,
        '--file-type', 'pdf',
    ]
    result = runner.invoke(main, cli_args)
    assert result.exit_code == 0

0 commit comments

Comments
 (0)