diff --git a/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportExecutionPlan.java b/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportExecutionPlan.java
index 6e961e27..30cc7536 100644
--- a/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportExecutionPlan.java
+++ b/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportExecutionPlan.java
@@ -28,9 +28,71 @@
import org.neo4j.importer.v1.graph.Graphs;
/**
- * Represents the entire parallelizable execution plan for an import step graph.
- * Tasks are grouped into groups, as a list of independent ImportStepGroup. Each group can be processed
- * entirely in parallel with the others.
+ * {@link ImportExecutionPlan} exposes the graph of {@link ImportStep} to execute in a way that eases import
+ * parallelization.
+ * The first level of parallelization is {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepGroup},
+ * retrieved with {@link ImportExecutionPlan#getGroups()}.
+ * Each group corresponds to a weakly connected component of the import step graph.
+ * For instance, the following YAML serialization of {@link org.neo4j.importer.v1.ImportSpecification} (other attributes
+ * are omitted for brevity):
+ *
+ * version: "1"
+ * sources:
+ * - name: actors
+ * - name: films
+ * targets:
+ * nodes:
+ * - source: actors
+ * name: actor_nodes
+ * - source: films
+ * name: film_nodes
+ *
+ *
+ * ... results in 2 groups:
+ * - 1 with the "actors" source and "actor_nodes" node target (converted respectively to {@link SourceStep} and
+ * {@link NodeTargetStep})
+ * - 1 with the "films" source and "film_nodes" node target (converted respectively to {@link SourceStep} and
+ * {@link NodeTargetStep})
+ *
+ * These groups can be processed in parallel.
+ * The import is considered completed when every group's import has completed.
+ * Each {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepGroup} is made of several
+ * {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepStage}, retrieved with
+ * {@link ImportStepGroup#getStages()}.
+ * Stages must be processed sequentially. In other words, the second stage cannot run until the first
+ * stage has completed, and so on.
+ *
+ * Assuming the following YAML serialization of {@link org.neo4j.importer.v1.ImportSpecification} (other attributes are
+ * omitted for brevity):
+ *
+ * version: "1"
+ * sources:
+ * - name: actors
+ * - name: films
+ * - name: actors_in_films
+ * targets:
+ * nodes:
+ * - source: actors
+ * name: actor_nodes
+ * - source: films
+ * name: film_nodes
+ * relationships:
+ * - source: actors_in_films
+ * name: actor_film_relationships
+ * start_node_reference: actor_nodes
+ * end_node_reference: film_nodes
+ *
+ * This would result in a single {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepGroup}
+ * (every step is linked, directly or indirectly).
+ * The group is made of at least 3 stages:
+ * - the first stage includes all the sources
+ * - the second stage includes all the nodes
+ * - the last stage includes the relationship
+ *
+ * Finally, each stage is made of several steps.
+ * These steps (either {@link SourceStep}, {@link NodeTargetStep}, {@link RelationshipTargetStep},
+ * {@link CustomQueryTargetStep} or {@link ActionStep}) can be processed in parallel.
+ * The enclosing stage execution is considered complete when all its steps have completed.
*/
public class ImportExecutionPlan {
diff --git a/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportPipeline.java b/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportPipeline.java
index 8f3c9707..fadcd849 100644
--- a/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportPipeline.java
+++ b/core/src/main/java/org/neo4j/importer/v1/pipeline/ImportPipeline.java
@@ -47,10 +47,21 @@
import org.neo4j.importer.v1.targets.Targets;
/**
- * {@link ImportPipeline} exposes a topologically-ordered set of {@link ImportStep},
+ * {@link ImportPipeline} exposes a topologically-ordered set of active {@link ImportStep},
* based on the provided {@link ImportSpecification}, usually created with
* {@link org.neo4j.importer.v1.ImportSpecificationDeserializer#deserialize(Reader)} or its variants.
+ *
+ * The existing types of the provided {@link ImportSpecification} are converted to their Step equivalent: {@link Source}
+ * gets translated to {@link SourceStep}, {@link NodeTarget} to {@link NodeTargetStep}, etc.
+ * Inactive {@link Target}s (see {@link Target#isActive}) are skipped and therefore not translated to their
+ * corresponding step type.
*
+ * In particular, {@link RelationshipTargetStep}s directly reference their start and end {@link NodeTargetStep}, whereas
+ * {@link RelationshipTarget}s simply reference the start and end node target name. This reduces the amount of required
+ * lookup logic. If the corresponding start and/or end {@link org.neo4j.importer.v1.targets.NodeReference} define
+ * key mapping overrides, the resulting {@link NodeTargetStep}s get their property mappings updated accordingly.
+ *
+ * Here is an example usage of {@link ImportPipeline} made possible by its {@link Iterable} implementation:
*
* var specification = org.neo4j.importer.v1.ImportSpecificationDeserializer.deserialize(aReader);
* var pipeline = @link ImportPipeline.of(specification);
@@ -66,14 +77,30 @@
* }
* });
*
- *
- * Since an {@link ImportStep} may have dependencies, which are either:
- * - implicit like a {@link TargetStep} depending on a {@link SourceStep},
- * a {@link RelationshipTargetStep} depending on start/end {@link NodeTargetStep}s
- * - and/or explicit (via {@link ImportStep#dependencies()}
- * ... the pipeline guarantees that dependencies are *always* returned after their dependents.
- * In particular, the dependencies of each {@link ActionStep} are resolved at pipeline construction, based on the
- * provided import specification and the corresponding {@link Action}'s {@link ActionStage}.
+ * The iteration returns every step in order.
+ * Each step is guaranteed to be processed after all its implicit and explicit dependencies.
+ *
+ * Implicit dependencies are:
+ * - {@link TargetStep} depending on a {@link SourceStep}
+ * - {@link RelationshipTargetStep} depending on start/end {@link NodeTargetStep}s
+ * - {@link RelationshipTargetStep} sharing common start/end nodes with other {@link RelationshipTargetStep}
+ * - {@link ActionStep} must define an {@link ActionStage}, which gets translated to a set of concrete dependencies
+ *
+ * Relationships sharing common nodes must not be imported in parallel as this would likely cause
+ * deadlock issues. The pipeline therefore adds a dependency between such relationships.
+ *
+ * {@link Action}s with {@link ActionStage#POST_QUERIES} are translated to
+ * {@link ActionStep} instances with dependencies on all the {@link CustomQueryTargetStep}s defined in the pipeline.
+ * {@link ActionStep}s with {@link ActionStage#PRE_NODES} make all the {@link NodeTargetStep}s in
+ * the pipeline depend on these actions.
+ * Finally, {@link ActionStep}s with {@link ActionStage#END} get the following dependencies:
+ * - all declared {@link SourceStep}
+ * - all declared {@link TargetStep}
+ * - all declared {@link ActionStep} with an {@link ActionStage} different from {@link ActionStage#END}
+ *
+ * Dependencies can also be explicit:
+ * - {@link TargetStep} can define dependencies on other {@link TargetStep}s
+ *
*/
public class ImportPipeline implements Iterable, Serializable {
@@ -94,6 +121,12 @@ public Iterator iterator() {
return stepGraph.keySet().iterator();
}
+ /**
+ * Returns an {@link ImportExecutionPlan}, which makes it easier to write parallelizable import backends, compared
+ * to the sequential {@link Iterable} API that {@link ImportPipeline} exposes.
+ * Please consult the documentation of {@link ImportExecutionPlan} for more details.
+ * @return the import execution plan, built from the same step graph that backs {@link #iterator()}
+ */
public ImportExecutionPlan executionPlan() {
return ImportExecutionPlan.fromGraph(stepGraph);
}