@@ -95,14 +95,14 @@ local/usage_template.tsv: nmdc_schema/nmdc_materialized_patterns.yaml # replaces
9595 --report-style exhaustive
9696
9797examples/output/Biosample-exhaustive_report.yaml : src/data/valid/Biosample-exhasutive.yaml # replaces misspelled Biosample-exhasutive_report target
98- poetry run exhaustion-check \
98+ $( RUN ) exhaustion-check \
9999 --class-name Biosample \
100100 --instance-yaml-file $< \
101101 --output-yaml-file $@ \
102102 --schema-path src/schema/nmdc.yaml
103103
104104examples/output/Biosample-exhasutive-pretty-sorted.yaml : src/data/valid/Biosample-exhasutive.yaml
105- poetry run pretty-sort-yaml \
105+ $( RUN ) pretty-sort-yaml \
106106 -i $< \
107107 -o $@
108108
@@ -259,12 +259,6 @@ local/biosample-slot-range-type-report.tsv: src/schema/nmdc.yaml
259259 --schema-class Biosample
260260
261261# ## example of preparing to validate napa squad data
262- local/nmdc-schema-v8.0.0.yaml :
263- curl -o $@ https://raw.githubusercontent.com/microbiomedata/nmdc-schema/v8.0.0/nmdc_schema/nmdc_materialized_patterns.yaml
264- # need to remove lines like this (see_alsos whose values aren't legitimate URIs)
265- # see_also:
266- # - MIxS:experimental_factor|additional_info
267- yq eval-all -i ' del(select(fileIndex == 0) | .. | select(has("see_also")) | .see_also)' $@
268262
269263local/nmdc-schema-v7.8.0.yaml :
270264 curl -o $@ https://raw.githubusercontent.com/microbiomedata/nmdc-schema/v7.8.0/nmdc_schema/nmdc_materialized_patterns.yaml
@@ -276,169 +270,139 @@ local/nmdc-schema-v7.8.0.yaml:
276270 yq -i ' del(.classes.DataObject.slot_usage.id.pattern)' $@ # kludge modify schema to match data
277271 rm -rf $@ .bak
278272
279- local/nmdc-schema-v8.0.0.owl.ttl : local/nmdc-schema-v8.0.0.yaml
280- $(RUN ) gen-owl --no-use-native-uris $< > $@
281-
282-
283273local/nmdc-schema-v7.8.0.owl.ttl : local/nmdc-schema-v7.8.0.yaml
284274 $(RUN ) gen-owl --no-use-native-uris $< > $@
285275
286- local/nmdc-sty-11-aygzgv51.yaml :
287- $(RUN ) get-study-related-records \
288- --api-base-url https://api-napa.microbiomedata.org \
289- extract-study \
290- --study-id $(subst nmdc-,nmdc:,$(basename $(notdir $@ ) ) ) \
291- --search-orphaned-data-objects \
292- --output-file $@
293276
294-
295- # ## FUSEKI, DOCKER, ETC
277+ # # FUSEKI, DOCKER, ETC
296278# we use Apache's Jena RDF/SPARQL framework
297279# Jena provides command line tools for accessing RDF *files*
298- # or you can use a TDB as the data backend
299- # Fuseki is a web interface for submitting SPARQL queries
300- # we are foing all of this in docker so you don't have to install any of this software
280+ # we use a Jena TDB2 database as the backend (as opposed to operating over files)
281+ # Fuseki is Jena's web interface for submitting SPARQL queries
282+ # we are doing all of this in docker so you don't have to install any of this system software
301283
302- .PHONY : thorough-docker-fuseki-cleanup-from-host # this level of cleanup may not be needed ona regular basis
284+ .PHONY : thorough-docker-fuseki-cleanup-from-host # this level of cleanup may not be needed on a regular basis
303285thorough-docker-fuseki-cleanup-from-host : some-napa-collections-cleanup
304286 - docker compose down
305287 rm -rf local/fuseki-data
306- rm -rf local/nmdc-data*
307- rm -rf local/nmdc-tdb2*
308288 rm -rf local/sparql-results/*
309289 rm -rf .venv
310290 docker system prune --force # very aggressive. may delete containers etc that you want but are not currently running
311291
312292.PHONY : docker-startup-from-host
313293docker-startup-from-host :
314294 docker compose up --build --detach # --build is only necessary if changes have been made to the Dockerfile
315- docker-compose run app poetry install
316- docker-compose run app create-nmdc-tdb2-from-app
317295
318- # manually: `docker compose exec app bash`
319- # then you can do any nmdc-schem makefile commands in the 'app' environment
296+ # from host: checkout the desired branch, fetch and pull
297+ # from host: `docker compose exec app bash`
298+ # it's best if there isn't already a ./.venv, especially if the environment wasn't built for Linux
299+ # in container: `poetry install`
300+ # in container: `make build-schema-in-app`
301+
302+ # then you can do any nmdc-schema makefile commands in the 'app' environment
320303
321304.PHONY : build-schema-in-app
322- build-schema-in-app :
323- # # Warning: 'get-study-related-records' is an entry point defined in pyproject.toml, but it's not installed as a script. You may get improper `sys.argv[0]`.
324- # # The support to run uninstalled scripts will be removed in a future release.
325- # # Run `poetry install` to resolve and get rid of this message.
326- # poetry install # it's best if there isn't already a ./.venv, especially if it's not for Linux
305+ build-schema-in-app : pre-build
327306 make squeaky-clean all test
328307
308+ .PHONY : pre-build
309+ pre-build : local/gold-study-ids.yaml create-nmdc-tdb2-from-app
310+
329311.PHONY : create-nmdc-tdb2-from-app
330312create-nmdc-tdb2-from-app : # Fuseki will get its data from this TDB2 database. It starts out empty.
331313 curl \
332314 --user ' admin:password' \
333315 --data ' dbType=tdb&dbName=nmdc-tdb2' \
334316 ' http://fuseki:3030/$$/datasets'
335317
336- # # does this with work in both the datasets offline and active states?
337- # local/nmdc-tdb2-graph-list.tsv:
338- # tdb2.tdbquery \
339- # --loc=$(FD_ROOT)/nmdc-tdb2 \
340- # --query=assets/sparql/tdb-graph-list.rq \
341- # --results=TSV > $@
342318
343- # curl -X DELETE \
344- # --user 'admin:password' \
345- # http://fuseki:3030/nmdc-tdb2/data?default
319+ # # Option 2 of 2 for getting data from MongoDB for Napa QC: get-study-id-from-filename
320+ #
321+ # advantage: can retrieve records that have known, characterized paths to a Study (useful scoping)
322+ # disadvantages:
323+ # some paths to records from some collections aren't implemented yet
324+ # runs slow on Mark's computers in Philadelphia. Running pure-export can retrieve more data more quickly, but without any scoping
325+ # the core targets include wildcards in their names,
326+ # but an individual YAML file can be built like this: make local/study-files/nmdc-sty-11-8fb6t785.yaml
327+ # or an explicit subset of YAML files can be built with: make create-study-yaml-files-subset
328+ # or YAML files can be built from a list of study ids (STUDY_IDS) like this: make create-study-yaml-files-from-study-ids-list
346329
347- # curl -X DELETE \
348- # --user 'admin:password' \
349- # http://fuseki:3030/nmdc-tdb2/data?graph=https://w3id.org/nmdc/nmdc
350330
331+ # can't ever be used without generating local/gold-study-ids.yaml first
332+ STUDY_IDS := $(shell yq '.resources.[].id' local/gold-study-ids.yaml | awk '{printf "% s ", $$0} END {print ""}')
351333
352- .PHONY : docker-compose-down-from-host
353- docker-compose-down-from-host :
354- docker compose down
334+ .PHONY : print-discovered-study-ids print-intended-yaml-files
355335
356- # ----
336+ # can't ever be used without generating local/gold-study-ids.yaml first
337+ print-discovered-study-ids :
338+ @echo $(STUDY_IDS )
357339
358- .PHONY : print-prefixed-study-ids print-file-list
340+ # Replace colons with hyphens in study IDs
341+ # can't ever be used without generating local/gold-study-ids.yaml first
342+ STUDY_YAML_FILES := $(addsuffix .yaml,$(addprefix local/study-files/,$(subst :,-,$(STUDY_IDS ) ) ) )
359343
360- STUDY_IDS := nmdc:sty-11-34xj1150 nmdc:sty-11-5bgrvr62 nmdc:sty-11-5tgfr349 nmdc:sty-11-r2h77870 \
361- nmdc:sty-11-db67n062 nmdc:sty-11-8xdqsn54 nmdc:sty-11-28tm5d36 nmdc:sty-11-33fbta56 nmdc:sty-11-076c9980 \
362- nmdc:sty-11-t91cwb40 nmdc:sty-11-aygzgv51 nmdc:sty-11-547rwq94 nmdc:sty-11-zs2syx06 nmdc:sty-11-dcqce727 \
363- nmdc:sty-11-1t150432 nmdc:sty-11-8fb6t785
344+ .PHONY : all-study-yaml-files
364345
365- # print-prefixed- study-ids:
366- # @echo $(STUDY_IDS )
346+ # can't ever be used without generating local/gold-study-ids.yaml first
347+ create-study-yaml-files-from-study-ids-list : $(STUDY_YAML_FILES )
367348
368- # Replace colons with hyphens in study IDs
369- STUDY_FILES := $(addsuffix .yaml,$(addprefix local/study-files/,$(subst :,-,$(STUDY_IDS ) ) ) )
370-
371- # print-file-list:
372- # @echo $(STUDY_FILES)
373-
374- # Napa nmdc:sty-11-aygzgv51 = "production" gold:Gs0114663
375-
376- # [('nmdc:sty-11-34xj1150', 4443),
377- # ('nmdc:sty-11-5bgrvr62', 471),
378- # ('nmdc:sty-11-5tgfr349', 430),
379- # ('nmdc:sty-11-r2h77870', 416),
380- # ('nmdc:sty-11-db67n062', 241),
381- # ('nmdc:sty-11-8xdqsn54', 207),
382- # ('nmdc:sty-11-28tm5d36', 134),
383- # ('nmdc:sty-11-33fbta56', 124),
384- # ('nmdc:sty-11-076c9980', 105),
385- # ('nmdc:sty-11-t91cwb40', 95),
386- # ('nmdc:sty-11-aygzgv51', 85),
387- # ('nmdc:sty-11-547rwq94', 80),
388- # ('nmdc:sty-11-zs2syx06', 60), # Extracted study nmdc:sty-11-zs2syx06 from the NMDC database in 0:00:01.475736.
389- # ('nmdc:sty-11-dcqce727', 53), # Extracted study nmdc:sty-11-dcqce727 from the NMDC database in 0:36:39.633116.
390- # ('nmdc:sty-11-1t150432', 30), # Extracted study nmdc:sty-11-8fb6t785 from the NMDC database in 0:01:04.012420, 0:01:17.337886.
391- # ('nmdc:sty-11-8fb6t785', 23)] # Extracted study nmdc:sty-11-8fb6t785 from the NMDC database in 0:01:01.963206.
392-
393- # local/study-files/nmdc-sty-11-34xj1150.yaml
394- local/study-files/% .yaml : local/nmdc-schema-v8.0.0.yaml
395- mkdir -p $(@D )
396- rm -rf study-file-name.txt study-id.txt
397- echo $@ > study-file-name.txt
398- echo $(shell poetry run get-study-id-from-filename $$(<study-file-name.txt ) ) > study-id.txt
399- # cumbersome! using python script because can't replace just first hyphen with colon with make text function
400- # then, can't use $@ inside of a make shell call
401- # we just want to transform $@, like local/study-files/nmdc-sty-11-8fb6t785.yaml to nmdc:sty-11-8fb6t785
402- date
403- time $(RUN ) get-study-related-records \
404- --api-base-url https://api-napa.microbiomedata.org \
405- extract-study \
406- --study-id $$(<study-id.txt ) \
407- --output-file $@
408- $(RUN ) linkml-validate --schema $< $@ > $@ .validation.log.txt
409- rm -rf study-file-name.txt study-id.txt
410-
411- create-study-yaml-files : local/study-files/nmdc-sty-11-8fb6t785.yaml \
412- local/study-files/nmdc-sty-11-1t150432.yaml \
413- local/study-files/nmdc-sty-11-zs2syx06.yaml
349+ # can't ever be used without generating local/gold-study-ids.yaml first
350+ print-intended-yaml-files : local/gold-study-ids.yaml
351+ @echo $(STUDY_YAML_FILES )
414352
415- # includes load into fuseki
416- # not doing any migration here yet
417- local/study-files/% .ttl : local/nmdc-schema-v8.0.0.yaml create-nmdc-tdb2-from-app create-study-yaml-files
418- $(RUN ) linkml-convert --output $@ --schema $< $(subst .ttl,.yaml,$@ )
419- curl -X \
420- POST -H " Content-Type: text/turtle" \
421- --user ' admin:password' \
422- --data-binary @$@ http://fuseki:3030/nmdc-tdb2/data? graph=https://api-napa.microbiomedata.org
353+ # # we can get a report of biosamples per study with the following
354+ # # may help predict how long it will take to run study-id-from-filename on a particular study
355+ # # will become unnecessary once aggregation queries are available in the napa nmdc-runtime API
356+ local/biosamples-per-study.txt :
357+ $(RUN ) python src/scripts/report_biosamples_per_study.py > $@
423358
424- create-load-study-ttl-files : local/study-files/nmdc-sty-11-8fb6t785.ttl \
425- local/study-files/nmdc-sty-11-1t150432.ttl \
426- local/study-files/nmdc-sty-11-zs2syx06.ttl
359+ # # getting a report of GOLD study identifiers, which might have been used as Study ids in legacy (pre-Napa) data
360+ local/gold-study-ids.json :
361+ curl -X ' GET' \
362+ --output $@ \
363+ ' https://api-napa.microbiomedata.org/nmdcschema/study_set?max_page_size=999&projection=id%2Cgold_study_identifiers' \
364+ -H ' accept: application/json'
427365
366+ local/gold-study-ids.yaml : local/gold-study-ids.json
367+ yq -p json -o yaml $< > $@
428368
429- # seems to work in both the datasets offline and active states
430- # could also show how to submit to fuseki via curl
431- # or could run interactively in Fuseki web UI, localhost:3030
432- # but that may only load in a private browser window
433- local/subjects-lacking-rdf-types.tsv :
434- tdb2.tdbquery \
435- --loc=$(FD_ROOT ) /nmdc-tdb2 \
436- --query=assets/sparql/subjects-lacking-rdf-types.rq \
437- --results=TSV > $@ # this doesn't take into consideration that some entities have nmdc:type string values, which should be migrated
369+ local/study-files/% .yaml : local/nmdc-schema-v7.8.0.yaml
370+ mkdir -p $(@D )
371+ study_file_name=` echo $@ ` ; \
372+ echo $$ study_file_name ; \
373+ study_id=` poetry run get-study-id-from-filename $$ study_file_name` ; \
374+ echo $$ study_id ; \
375+ date ; \
376+ time $(RUN ) get-study-related-records \
377+ --api-base-url https://api-napa.microbiomedata.org \
378+ extract-study \
379+ --study-id $$ study_id \
380+ --output-file $@ .tmp.yaml
381+ sed -i.bak ' s/gold:/GOLD:/' $@ .tmp.yaml # kludge modify data to match (old!) schema
382+ rm -rf $@ .tmp.bak
383+ - $(RUN ) linkml-validate --schema $< $@ .tmp.yaml > $@ .validation.log.txt
384+ time $(RUN ) migration-recursion \
385+ --schema-path $< \
386+ --input-path $@ .tmp.yaml \
387+ --salvage-prefix generic \
388+ --output-path $@ # kludge masks ids that contain whitespace
389+ rm -rf $@ .tmp.yaml $@ .tmp.yaml.bak
390+
391+ .PHONY : create-study-yaml-files-subset create-study-ttl-files-subset load-from-some-napa-collections
392+
393+ create-study-yaml-files-subset : local/study-files/nmdc-sty-11-8fb6t785.yaml \
394+ local/study-files/nmdc-sty-11-1t150432.yaml \
395+ local/study-files/nmdc-sty-11-dcqce727.yaml
438396
397+ local/study-files/% .ttl : local/nmdc-schema-v7.8.0.yaml create-nmdc-tdb2-from-app create-study-yaml-files-subset
398+ $(RUN ) linkml-convert --output $@ --schema $< $(subst .ttl,.yaml,$@ )
439399
440- # retreive, validate, convert, repair, load and query selected colelctiosn form the Napa squad's MongoDB
400+ create-study-ttl-files-subset : local/study-files/nmdc-sty-11-8fb6t785.ttl \
401+ local/study-files/nmdc-sty-11-1t150432.ttl \
402+ local/study-files/nmdc-sty-11-dcqce727.ttl
441403
404+ # # Option 1 of 2 for getting data from MongoDB for Napa QC: pure-export
405+ # retrieve selected collections from the Napa squad's MongoDB and fix ids containing whitespace
442406local/some_napa_collections.yaml : local/nmdc-schema-v7.8.0.yaml
443407 date
444408 time $(RUN ) pure-export \
@@ -491,8 +455,6 @@ load-from-some-napa-collections: local/some_napa_collections.ttl
491455 --data-binary @$< http://fuseki:3030/nmdc-tdb2/data? graph=https://api-napa.microbiomedata.org
492456
493457.PHONY : load-non-native-uri-schema
494- # from linkml/linkml branch issue-1842
495- # poetry run gen-owl --no-use-native-uris ../nmdc-schema/src/schema/nmdc.yaml > ../nmdc-schema/local/nmdc_with_non_native_uris.owl.ttl
496458load-non-native-uri-schema : local/nmdc-schema-v7.8.0.owl.ttl create-nmdc-tdb2-from-app
497459 curl -X \
498460 POST -H " Content-Type: text/turtle" \
@@ -512,11 +474,40 @@ some-napa-collections-cleanup:
512474 rm -rf local/some_napa_collections*
513475 rm -rf local/nmdc-schema*
514476
515-
516477.PHONY : clear-data-graph some-napa-collections-cleanup
517478clear-data-graph :
518479 curl -X \
519480 POST -H " Content-Type: application/sparql-update" \
520481 --user ' admin:password' \
521482 --data " CLEAR GRAPH <https://api-napa.microbiomedata.org>" \
522- http://fuseki:3030/nmdc-tdb2/update
483+ http://fuseki:3030/nmdc-tdb2/update
484+
485+ .PHONY : docker-compose-down-from-host
486+ docker-compose-down-from-host :
487+ docker compose down
488+
489+ # # Querying with Fuseki's SPARQL API is preferred. Here's an example of querying the TDB2 database directly.
490+ # # We haven't determined whether the direct query is appropriate or preferable in any cases
491+ # or could run interactively in Fuseki web UI, localhost:3030
492+ # but that may only load in a private browser window
493+
494+ # local/nmdc-tdb2-graph-list.tsv:
495+ # tdb2.tdbquery \
496+ # --loc=$(FD_ROOT)/nmdc-tdb2 \
497+ # --query=assets/sparql/tdb-graph-list.rq \
498+ # --results=TSV > $@
499+
500+ # local/subjects-lacking-rdf-types.tsv:
501+ # tdb2.tdbquery \
502+ # --loc=$(FD_ROOT)/nmdc-tdb2 \
503+ # --query=assets/sparql/subjects-lacking-rdf-types.rq \
504+ # --results=TSV > $@ # this doesn't take into consideration that some entities have nmdc:type string values, which should be migrated
505+
506+ # # when would we want to delete instead of clearing?
507+ # curl -X DELETE \
508+ # --user 'admin:password' \
509+ # http://fuseki:3030/nmdc-tdb2/data?default
510+
511+ # curl -X DELETE \
512+ # --user 'admin:password' \
513+ # http://fuseki:3030/nmdc-tdb2/data?graph=https://w3id.org/nmdc/nmdc
0 commit comments