
Commit df7bd5e

Merge pull request #1746 from microbiomedata/1735-referential-ineggrity-checking-of-napa-squads-mongodb-via-sparql-in-fuseki
better differentiation between pure-export and get-study-id-from-filename
2 parents 5146980 + 5a24e62 commit df7bd5e

File tree

2 files changed: +124 -141 lines changed

project.Makefile

Lines changed: 121 additions & 130 deletions
@@ -95,14 +95,14 @@ local/usage_template.tsv: nmdc_schema/nmdc_materialized_patterns.yaml # replaces
 		--report-style exhaustive
 
 examples/output/Biosample-exhaustive_report.yaml: src/data/valid/Biosample-exhasutive.yaml # replaces misspelled Biosample-exhasutive_report target
-	poetry run exhaustion-check \
+	$(RUN) exhaustion-check \
 		--class-name Biosample \
 		--instance-yaml-file $< \
 		--output-yaml-file $@ \
 		--schema-path src/schema/nmdc.yaml
 
 examples/output/Biosample-exhasutive-pretty-sorted.yaml: src/data/valid/Biosample-exhasutive.yaml
-	poetry run pretty-sort-yaml \
+	$(RUN) pretty-sort-yaml \
 		-i $< \
 		-o $@
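The only substantive change in this hunk is swapping a hard-coded `poetry run` for the Makefile's $(RUN) variable. A minimal sketch of that convention, assuming RUN is defined near the top of project.Makefile as in LinkML-cookiecutter-style Makefiles (the definition itself is outside this diff):

    # assumed definition, elsewhere in project.Makefile:
    RUN = poetry run

    run-demo:
    	$(RUN) pretty-sort-yaml --help   # expands to: poetry run pretty-sort-yaml --help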

@@ -259,12 +259,6 @@ local/biosample-slot-range-type-report.tsv: src/schema/nmdc.yaml
 		--schema-class Biosample
 
 ### example of preparing to validate napa squad data
-local/nmdc-schema-v8.0.0.yaml:
-	curl -o $@ https://raw.githubusercontent.com/microbiomedata/nmdc-schema/v8.0.0/nmdc_schema/nmdc_materialized_patterns.yaml
-	# need to remove lines like this (see_alsos whose values aren't legitimate URIs)
-	# see_also:
-	#   - MIxS:experimental_factor|additional_info
-	yq eval-all -i 'del(select(fileIndex == 0) | .. | select(has("see_also")) | .see_also)' $@
 
 local/nmdc-schema-v7.8.0.yaml:
 	curl -o $@ https://raw.githubusercontent.com/microbiomedata/nmdc-schema/v7.8.0/nmdc_schema/nmdc_materialized_patterns.yaml
@@ -276,169 +270,139 @@ local/nmdc-schema-v7.8.0.yaml:
 	yq -i 'del(.classes.DataObject.slot_usage.id.pattern)' $@ # kludge modify schema to match data
 	rm -rf $@.bak
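For context on the kludge line above: yq's del() removes one deeply nested key in place. A hedged illustration, with a hypothetical pattern value (the real value lives in the downloaded schema, not in this diff):

    # before (illustrative excerpt of the materialized schema):
    #   classes:
    #     DataObject:
    #       slot_usage:
    #         id:
    #           pattern: "^nmdc:dobj-.*$"   # hypothetical value
    yq -i 'del(.classes.DataObject.slot_usage.id.pattern)' local/nmdc-schema-v7.8.0.yaml
    # after: the same document, minus the pattern line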

-local/nmdc-schema-v8.0.0.owl.ttl: local/nmdc-schema-v8.0.0.yaml
-	$(RUN) gen-owl --no-use-native-uris $< > $@
-
-
 local/nmdc-schema-v7.8.0.owl.ttl: local/nmdc-schema-v7.8.0.yaml
 	$(RUN) gen-owl --no-use-native-uris $< > $@
 
-local/nmdc-sty-11-aygzgv51.yaml:
-	$(RUN) get-study-related-records \
-		--api-base-url https://api-napa.microbiomedata.org \
-		extract-study \
-		--study-id $(subst nmdc-,nmdc:,$(basename $(notdir $@))) \
-		--search-orphaned-data-objects \
-		--output-file $@
 
-
-### FUSEKI, DOCKER, ETC
+## FUSEKI, DOCKER, ETC
 # we use Apache's Jena RDF/SPARQL framework
 # Jena provides command line tools for accessing RDF *files*
-# or you can use a TDB as the data backend
-# Fuseki is a web interface for submitting SPARQL queries
-# we are foing all of this in docker so you don't have to install any of this software
+# we use a Jena TDB2 database as the backend (as opposed to operating over files)
+# Fuseki is Jena's web interface for submitting SPARQL queries
+# we are doing all of this in docker so you don't have to install any of this system software
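To ground those comments, here is a hedged example of the kind of HTTP request Fuseki accepts; it assumes the nmdc-tdb2 dataset created by a target further below, and Fuseki's default /sparql query path for a dataset of that name:

    curl --data-urlencode 'query=SELECT * WHERE { ?s ?p ?o } LIMIT 5' \
    	-H 'Accept: application/sparql-results+json' \
    	http://fuseki:3030/nmdc-tdb2/sparql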

-.PHONY: thorough-docker-fuseki-cleanup-from-host # this level of cleanup may not be needed ona regular basis
+.PHONY: thorough-docker-fuseki-cleanup-from-host # this level of cleanup may not be needed on a regular basis
 thorough-docker-fuseki-cleanup-from-host: some-napa-collections-cleanup
 	- docker compose down
 	rm -rf local/fuseki-data
-	rm -rf local/nmdc-data*
-	rm -rf local/nmdc-tdb2*
 	rm -rf local/sparql-results/*
 	rm -rf .venv
 	docker system prune --force # very aggressive. may delete containers etc that you want but are not currently running
 
 .PHONY: docker-startup-from-host
 docker-startup-from-host:
 	docker compose up --build --detach # --build is only necessary if changes have been made to the Dockerfile
-	docker-compose run app poetry install
-	docker-compose run app create-nmdc-tdb2-from-app
 
-# manually: `docker compose exec app bash`
-# then you can do any nmdc-schem makefile commands in the 'app' environment
+# from host: checkout the desired branch, fetch and pull
+# from host: `docker compose exec app bash`
+# it's best if there isn't already a ./.venv, especially if the environment wasn't built for Linux
+# in container: `poetry install`
+# in container: `make build-schema-in-app`
+
+# then you can do any nmdc-schema makefile commands in the 'app' environment
 
 .PHONY: build-schema-in-app
-build-schema-in-app:
-	# # Warning: 'get-study-related-records' is an entry point defined in pyproject.toml, but it's not installed as a script. You may get improper `sys.argv[0]`.
-	# # The support to run uninstalled scripts will be removed in a future release.
-	# # Run `poetry install` to resolve and get rid of this message.
-	# poetry install # it's best if there isn't already a ./.venv, especially if it's not for Linux
+build-schema-in-app: pre-build
 	make squeaky-clean all test
 
+.PHONY: pre-build
+pre-build: local/gold-study-ids.yaml create-nmdc-tdb2-from-app
+
 .PHONY: create-nmdc-tdb2-from-app
 create-nmdc-tdb2-from-app: # Fuseki will get it's data from this TDB2 database. It starts out empty.
 	curl \
 		--user 'admin:password' \
 		--data 'dbType=tdb&dbName=nmdc-tdb2' \
 		'http://fuseki:3030/$$/datasets'
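A hedged verification step for the target above: a GET on the same Fuseki admin endpoint lists the registered datasets, so a successful run should show nmdc-tdb2 (credentials per your compose setup):

    curl --user 'admin:password' 'http://fuseki:3030/$/datasets'
    # expect a JSON body whose datasets list includes an entry named /nmdc-tdb2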

-## does this with work in both the datasets offline and active states?
-#local/nmdc-tdb2-graph-list.tsv:
-#	tdb2.tdbquery \
-#		--loc=$(FD_ROOT)/nmdc-tdb2 \
-#		--query=assets/sparql/tdb-graph-list.rq \
-#		--results=TSV > $@
 
-# curl -X DELETE \
-#	--user 'admin:password' \
-#	http://fuseki:3030/nmdc-tdb2/data?default
+## Option 2 of 2 for getting data from MongoDB for Napa QC: get-study-id-from-filename
+#
+# advantage: can retrieve records that have known, characterized paths to a Study (useful scoping)
+# disadvantages:
+#   some paths to records from some collections aren't implemented yet
+#   runs slow on Mark's computers in Philadelphia. Running pure-export can retrieve more data more quickly, but without any scoping
+# the core targets include wildcards in their names,
+#   but an individual YAML file can be built like this: make local/study-files/nmdc-sty-11-8fb6t785.yaml
+#   or an explicit subset of YAML files can be built with: make create-study-yaml-files-subset
+#   or YAML files can be built from a list of study ids (STUDY_IDS) like this: make create-study-yaml-files-from-study-ids-list
 
-# curl -X DELETE \
-#	--user 'admin:password' \
-#	http://fuseki:3030/nmdc-tdb2/data?graph=https://w3id.org/nmdc/nmdc
 
+# can't ever be used without generating local/gold-study-ids.yaml first
+STUDY_IDS := $(shell yq '.resources.[].id' local/gold-study-ids.yaml | awk '{printf "%s ", $$0} END {print ""}')
 
-.PHONY: docker-compose-down-from-host
-docker-compose-down-from-host:
-	docker compose down
+.PHONY: print-discovered-study-ids print-intended-yaml-files
 
-# ----
+# can't ever be used without generating local/gold-study-ids.yaml first
+print-discovered-study-ids:
+	@echo $(STUDY_IDS)
 
-.PHONY: print-prefixed-study-ids print-file-list
+# Replace colons with hyphens in study IDs
+# can't ever be used without generating local/gold-study-ids.yaml first
+STUDY_YAML_FILES := $(addsuffix .yaml,$(addprefix local/study-files/,$(subst :,-,$(STUDY_IDS))))
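To make the two new variables concrete, here is a hedged expansion sketch; the ids shown are ones named elsewhere in this diff, and actual values depend on what the API wrote into local/gold-study-ids.yaml:

    # STUDY_IDS: yq extracts each id, awk joins them onto one space-separated line:
    #   nmdc:sty-11-8fb6t785 nmdc:sty-11-1t150432 ...
    # STUDY_YAML_FILES: $(subst :,-,...) turns every colon into a hyphen, then the path prefix and .yaml suffix are added:
    #   local/study-files/nmdc-sty-11-8fb6t785.yaml local/study-files/nmdc-sty-11-1t150432.yaml ...
    make local/gold-study-ids.yaml print-discovered-study-ids print-intended-yaml-files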

-STUDY_IDS := nmdc:sty-11-34xj1150 nmdc:sty-11-5bgrvr62 nmdc:sty-11-5tgfr349 nmdc:sty-11-r2h77870 \
-	nmdc:sty-11-db67n062 nmdc:sty-11-8xdqsn54 nmdc:sty-11-28tm5d36 nmdc:sty-11-33fbta56 nmdc:sty-11-076c9980 \
-	nmdc:sty-11-t91cwb40 nmdc:sty-11-aygzgv51 nmdc:sty-11-547rwq94 nmdc:sty-11-zs2syx06 nmdc:sty-11-dcqce727 \
-	nmdc:sty-11-1t150432 nmdc:sty-11-8fb6t785
+.PHONY: all-study-yaml-files
 
-#print-prefixed-study-ids:
-#	@echo $(STUDY_IDS)
+# can't ever be used without generating local/gold-study-ids.yaml first
+create-study-yaml-files-from-study-ids-list: $(STUDY_YAML_FILES)
 
-# Replace colons with hyphens in study IDs
-STUDY_FILES := $(addsuffix .yaml,$(addprefix local/study-files/,$(subst :,-,$(STUDY_IDS))))
-
-#print-file-list:
-#	@echo $(STUDY_FILES)
-
-# Napa nmdc:sty-11-aygzgv51 = "production" gold:Gs0114663
-
-# [('nmdc:sty-11-34xj1150', 4443),
-#  ('nmdc:sty-11-5bgrvr62', 471),
-#  ('nmdc:sty-11-5tgfr349', 430),
-#  ('nmdc:sty-11-r2h77870', 416),
-#  ('nmdc:sty-11-db67n062', 241),
-#  ('nmdc:sty-11-8xdqsn54', 207),
-#  ('nmdc:sty-11-28tm5d36', 134),
-#  ('nmdc:sty-11-33fbta56', 124),
-#  ('nmdc:sty-11-076c9980', 105),
-#  ('nmdc:sty-11-t91cwb40', 95),
-#  ('nmdc:sty-11-aygzgv51', 85),
-#  ('nmdc:sty-11-547rwq94', 80),
-#  ('nmdc:sty-11-zs2syx06', 60), # Extracted study nmdc:sty-11-zs2syx06 from the NMDC database in 0:00:01.475736.
-#  ('nmdc:sty-11-dcqce727', 53), # Extracted study nmdc:sty-11-dcqce727 from the NMDC database in 0:36:39.633116.
-#  ('nmdc:sty-11-1t150432', 30), # Extracted study nmdc:sty-11-8fb6t785 from the NMDC database in 0:01:04.012420, 0:01:17.337886.
-#  ('nmdc:sty-11-8fb6t785', 23)] # Extracted study nmdc:sty-11-8fb6t785 from the NMDC database in 0:01:01.963206.
-
-# local/study-files/nmdc-sty-11-34xj1150.yaml
-local/study-files/%.yaml: local/nmdc-schema-v8.0.0.yaml
-	mkdir -p $(@D)
-	rm -rf study-file-name.txt study-id.txt
-	echo $@ > study-file-name.txt
-	echo $(shell poetry run get-study-id-from-filename $$(<study-file-name.txt)) > study-id.txt
-	# cumbersome! using python script because can't replace just first hyphen with colon with make text function
-	# then, can't use $@ inside of a make shell call
-	# we just want to transform $@, like local/study-files/nmdc-sty-11-8fb6t785.yaml to nmdc:sty-11-8fb6t785
-	date
-	time $(RUN) get-study-related-records \
-		--api-base-url https://api-napa.microbiomedata.org \
-		extract-study \
-		--study-id $$(<study-id.txt) \
-		--output-file $@
-	$(RUN) linkml-validate --schema $< $@ > $@.validation.log.txt
-	rm -rf study-file-name.txt study-id.txt
-
-create-study-yaml-files: local/study-files/nmdc-sty-11-8fb6t785.yaml \
-	local/study-files/nmdc-sty-11-1t150432.yaml \
-	local/study-files/nmdc-sty-11-zs2syx06.yaml
+# can't ever be used without generating local/gold-study-ids.yaml first
+print-intended-yaml-files: local/gold-study-ids.yaml
+	@echo $(STUDY_YAML_FILES)

-# includes load into fuseki
-# not doing any migration here yet
-local/study-files/%.ttl: local/nmdc-schema-v8.0.0.yaml create-nmdc-tdb2-from-app create-study-yaml-files
-	$(RUN) linkml-convert --output $@ --schema $< $(subst .ttl,.yaml,$@)
-	curl -X \
-		POST -H "Content-Type: text/turtle" \
-		--user 'admin:password' \
-		--data-binary @$@ http://fuseki:3030/nmdc-tdb2/data?graph=https://api-napa.microbiomedata.org
+## we can get a report of biosamples per study with the following
+## may help predict how long it will take to run study-id-from-filename on a particular study
+## will become unnecessary once aggregation queries are available in the napa nmdc-runtime API
+local/biosamples-per-study.txt:
+	$(RUN) python src/scripts/report_biosamples_per_study.py > $@
 
-create-load-study-ttl-files: local/study-files/nmdc-sty-11-8fb6t785.ttl \
-	local/study-files/nmdc-sty-11-1t150432.ttl \
-	local/study-files/nmdc-sty-11-zs2syx06.ttl
+## getting a report of GOLD study identifiers, which might have been used as Study ids in legacy (pre-Napa) data
+local/gold-study-ids.json:
+	curl -X 'GET' \
+		--output $@ \
+		'https://api-napa.microbiomedata.org/nmdcschema/study_set?max_page_size=999&projection=id%2Cgold_study_identifiers' \
+		-H 'accept: application/json'
 
+local/gold-study-ids.yaml: local/gold-study-ids.json
+	yq -p json -o yaml $< > $@
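A hedged sketch of what these two targets produce; the JSON shape is inferred from the `.resources.[].id` path that STUDY_IDS reads above, and the pairing shown reuses ids mentioned elsewhere in this diff, so treat the values as illustrative:

    # local/gold-study-ids.json (abridged, shape inferred from the yq path used above):
    #   {"resources": [{"id": "nmdc:sty-11-aygzgv51", "gold_study_identifiers": ["gold:Gs0114663"]}, ...]}
    yq -p json -o yaml local/gold-study-ids.json   # same structure, re-serialized as YAML for the yq/awk pipeline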

-# seems to work in both the datasets offline and active states
-# could also show how to submit to fuseki via curl
-# or could run interactively in Fuseki web UI, localhost:3030
-# but that may only load in a private browser window
-local/subjects-lacking-rdf-types.tsv:
-	tdb2.tdbquery \
-		--loc=$(FD_ROOT)/nmdc-tdb2 \
-		--query=assets/sparql/subjects-lacking-rdf-types.rq \
-		--results=TSV > $@ # this doesn't take into consideration that some entities have nmdc:type string values, which should be migrated
+local/study-files/%.yaml: local/nmdc-schema-v7.8.0.yaml
+	mkdir -p $(@D)
+	study_file_name=`echo $@` ; \
+	echo $$study_file_name ; \
+	study_id=`poetry run get-study-id-from-filename $$study_file_name` ; \
+	echo $$study_id ; \
+	date ; \
+	time $(RUN) get-study-related-records \
+		--api-base-url https://api-napa.microbiomedata.org \
+		extract-study \
+		--study-id $$study_id \
+		--output-file $@.tmp.yaml
+	sed -i.bak 's/gold:/GOLD:/' $@.tmp.yaml # kludge modify data to match (old!) schema
+	rm -rf $@.tmp.bak
+	- $(RUN) linkml-validate --schema $< $@.tmp.yaml > $@.validation.log.txt
+	time $(RUN) migration-recursion \
+		--schema-path $< \
+		--input-path $@.tmp.yaml \
+		--salvage-prefix generic \
+		--output-path $@ # kludge masks ids that contain whitespace
+	rm -rf $@.tmp.yaml $@.tmp.yaml.bak
+
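One mechanic in the rule above deserves a note: make runs each recipe line in its own shell, so the `; \` continuations are what carry study_id from the line that computes it to the line that uses it. A minimal demonstration of the difference:

    chain-demo:
    	x=hello ; \
    	echo "chained: $$x"    # prints: chained: hello

    no-chain-demo:
    	x=hello
    	echo "unchained: $$x"  # prints: unchained:  (x was set in a separate shell)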
+.PHONY: create-study-yaml-files-subset create-study-ttl-files-subset load-from-some-napa-collections
+
+create-study-yaml-files-subset: local/study-files/nmdc-sty-11-8fb6t785.yaml \
+	local/study-files/nmdc-sty-11-1t150432.yaml \
+	local/study-files/nmdc-sty-11-dcqce727.yaml
 
+local/study-files/%.ttl: local/nmdc-schema-v7.8.0.yaml create-nmdc-tdb2-from-app create-study-yaml-files-subset
+	$(RUN) linkml-convert --output $@ --schema $< $(subst .ttl,.yaml,$@)
 
-# retreive, validate, convert, repair, load and query selected colelctiosn form the Napa squad's MongoDB
+create-study-ttl-files-subset: local/study-files/nmdc-sty-11-8fb6t785.ttl \
+	local/study-files/nmdc-sty-11-1t150432.ttl \
+	local/study-files/nmdc-sty-11-dcqce727.ttl
 
+## Option 1 of 2 for getting data from MongoDB for Napa QC: pure-export
+# retrieve selected collections from the Napa squad's MongoDB and fix ids containing whitespace
 local/some_napa_collections.yaml: local/nmdc-schema-v7.8.0.yaml
 	date
 	time $(RUN) pure-export \
@@ -491,8 +455,6 @@ load-from-some-napa-collections: local/some_napa_collections.ttl
 		--data-binary @$< http://fuseki:3030/nmdc-tdb2/data?graph=https://api-napa.microbiomedata.org
 
 .PHONY: load-non-native-uri-schema
-# from linkml/linkml branch issue-1842
-# poetry run gen-owl --no-use-native-uris ../nmdc-schema/src/schema/nmdc.yaml > ../nmdc-schema/local/nmdc_with_non_native_uris.owl.ttl
 load-non-native-uri-schema: local/nmdc-schema-v7.8.0.owl.ttl create-nmdc-tdb2-from-app
 	curl -X \
 		POST -H "Content-Type: text/turtle" \
@@ -512,11 +474,40 @@ some-napa-collections-cleanup:
 	rm -rf local/some_napa_collections*
 	rm -rf local/nmdc-schema*
 
-
 .PHONY: clear-data-graph some-napa-collections-cleanup
 clear-data-graph:
 	curl -X \
 		POST -H "Content-Type: application/sparql-update" \
 		--user 'admin:password' \
 		--data "CLEAR GRAPH <https://api-napa.microbiomedata.org>" \
-		http://fuseki:3030/nmdc-tdb2/update
+		http://fuseki:3030/nmdc-tdb2/update
+
+.PHONY: docker-compose-down-from-host
+docker-compose-down-from-host:
+	docker compose down
+
+## Querying with Fuseki's SPARQL API is preferred. Here's an example of querying the TDB2 database directly.
+## We haven't determined whether the direct query is appropriate or preferable in any cases
+# or could run interactively in Fuseki web UI, localhost:3030
+# but that may only load in a private browser window
+
+#local/nmdc-tdb2-graph-list.tsv:
+#	tdb2.tdbquery \
+#		--loc=$(FD_ROOT)/nmdc-tdb2 \
+#		--query=assets/sparql/tdb-graph-list.rq \
+#		--results=TSV > $@
+
+#local/subjects-lacking-rdf-types.tsv:
+#	tdb2.tdbquery \
+#		--loc=$(FD_ROOT)/nmdc-tdb2 \
+#		--query=assets/sparql/subjects-lacking-rdf-types.rq \
+#		--results=TSV > $@ # this doesn't take into consideration that some entities have nmdc:type string values, which should be migrated
+
+## when would we want to delete instead of clearing?
+# curl -X DELETE \
+#	--user 'admin:password' \
+#	http://fuseki:3030/nmdc-tdb2/data?default
+
+# curl -X DELETE \
+#	--user 'admin:password' \
+#	http://fuseki:3030/nmdc-tdb2/data?graph=https://w3id.org/nmdc/nmdc
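On the closing question above (delete vs. clear): in SPARQL 1.1 Update, CLEAR GRAPH removes a graph's triples but can leave the empty graph registered, while DROP GRAPH (or, for the Graph Store endpoint, the commented HTTP DELETE calls) removes the graph itself. A hedged sketch mirroring the clear-data-graph target:

    curl -X POST -H "Content-Type: application/sparql-update" \
    	--user 'admin:password' \
    	--data "DROP GRAPH <https://api-napa.microbiomedata.org>" \
    	http://fuseki:3030/nmdc-tdb2/update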

src/scripts/report_biosamples_per_study.py

Lines changed: 3 additions & 11 deletions
@@ -1,7 +1,7 @@
 # THIS IS ALL IRRELEVANT FOR NOW SINCE THE AGGREGATION QUERY IS NOT YET AVAILABLE ON THE NAPA API
 
 # visit https://api-dev.microbiomedata.org/docs
-# todo: when will aggregation query running be avaialble on the production or napa APIs?
+# todo: when will aggregation query running be available on the production or napa APIs?
 # click authorize
 # for now, enter username and password in the OAuth2PasswordOrClientCredentialsBearer (OAuth2, password) form
 # some additional authentication methods are already available and still more are to be added
@@ -67,13 +67,5 @@
 
 print(f"{len(sorted_study_count)} studies with at least one Biosample part")
 
-# Extract suffix portion from each key and join them into a string
-suffix_list = ' '.join(key.split(':')[1] for key in study_count.keys())
-
-print("Desired output:", suffix_list)
-
-print("HELLO")
-
-# for current_key in study_count.keys():
-#     print(current_key)
-#     print(current_key.split(':')[1])
+# # Extract suffix portion from each key and join them into a string
+# suffix_list = ' '.join(key.split(':')[1] for key in study_count.keys())
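Taken together with the Makefile changes, a hedged sketch of the "Option 2" flow this commit wires up (run inside the app container; every target named here appears in the diff above):

    make local/gold-study-ids.yaml                      # discover study ids from the napa API
    make print-discovered-study-ids                     # sanity-check what was found
    make local/biosamples-per-study.txt                 # this script: rough per-study extraction cost
    make create-study-yaml-files-from-study-ids-list    # extract one YAML file per discovered study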

0 commit comments
