neo4j · fbiville · Nov 24, 2025 · Nov 7, 2025 · Nov 7, 2025 · fbiville
@@ -28,9 +28,71 @@
 import org.neo4j.importer.v1.graph.Graphs;
 
 /**
- * Represents the entire parallelizable execution plan for an import step graph.
- * Tasks are grouped into groups, as a list of independent ImportStepGroup. Each group can be processed
- * entirely in parallel with the others.
+ * {@link ImportExecutionPlan} exposes the graph of {@link ImportStep} to execute in a way that eases import
+ * parallelization.<br><br>
+ * The first level of parallelization is {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepGroup},
+ * retrieved with {@link ImportExecutionPlan#getGroups()}.
+ * Each group corresponds to a weakly connected component of the import step graph.<br>
+ * For instance, the following YAML serialization of {@link org.neo4j.importer.v1.ImportSpecification} (other attributes
+ * are omitted for brevity):
+ * <pre><code>
+ * version: "1"
+ * sources:
+ *   - name: actors
+ *   - name: films
+ * targets:
+ *   nodes:
+ *     - source: actors
+ *       name: actor_nodes
+ *     - source: films
+ *       name: film_nodes
+ * </code></pre>
+ * <br>
+ * ... results into 2 groups:<br><br>
+ *  - 1 with the "actors" source and "actor_nodes" node target (converted respectively to {@link SourceStep} and
+ *  {@link NodeTargetStep})<br>
+ *  - 1 with the "films" source and "film_nodes" node target (converted respectively to {@link SourceStep} and
+ *  {@link NodeTargetStep})<br>
+ *  <br>
+ *  These groups can be processed in parallel.
+ *  The import is considered completed when every group's import has completed.<br><br>
+ *  Each {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepGroup} is made of several
+ *  {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepStage}, retrieved with
+ *  {@link ImportStepGroup#getStages()}.<br>
+ *  Stages <strong>must</strong> be processed sequentially. In other words, the second stage can not run until the first
+ *  stage has completed, and so on.<br>
+ *  <br>
+ *  Assuming the following YAML serialization of {@link org.neo4j.importer.v1.ImportSpecification} (other attributes are
+ *  omitted for brevity):
+ * <pre><code>
+ * version: "1"
+ * sources:
+ *   - name: actors
+ *   - name: films
+ *   - name: actors_in_films
+ * targets:
+ *   nodes:
+ *     - source: actors
+ *       name: actor_nodes
+ *     - source: films
+ *       name: film_nodes
+ *   relationships:
+ *     - source: actors_in_films
+ *       name: actor_film_relationships
+ *       start_node_reference: actor_nodes
+ *       end_node_reference: film_nodes
+ * </code></pre>
+ * This would result in a single {@link org.neo4j.importer.v1.pipeline.ImportExecutionPlan.ImportStepGroup}
+ * (every step is linked, directly or indirectly).
+ * The group is made of at least 3 stages:<br>
+ *  - the first stage includes all the sources<br>
+ *  - the second stage includes all the nodes<br>
+ *  - the last stage includes the relationship<br>
+ *  <br>
+ *  Finally, each stage is made of several steps.
+ *  These steps (either {@link SourceStep}, {@link NodeTargetStep}, {@link RelationshipTargetStep},
+ *  {@link CustomQueryTargetStep} or {@link ActionStep}) can be processed in parallel.<br>
+ *  The enclosing stage execution is considered complete when all its steps have completed.
  */
 public class ImportExecutionPlan {
 

@@ -47,10 +47,21 @@
 import org.neo4j.importer.v1.targets.Targets;
 
 /**
- * {@link ImportPipeline} exposes a topologically-ordered set of {@link ImportStep},
+ * {@link ImportPipeline} exposes a topologically-ordered set of active {@link ImportStep},
  * based on the provided {@link ImportSpecification}, usually created with
  * {@link org.neo4j.importer.v1.ImportSpecificationDeserializer#deserialize(Reader)} or its variants.
+ * <br>
+ * The existing types of the provided {@link ImportSpecification} are converted to their Step equivalent: {@link Source}
+ * gets translated to {@link SourceStep}, {@link NodeTarget} to {@link NodeTargetStep} etc...<br>
+ * Inactive {@link Target}s (see {@link Target#isActive}) are skipped and therefore not translated to their
+ * corresponding step type.
  * <br><br>
+ * In particular, {@link RelationshipTargetStep}s directly reference their start and end {@link NodeTargetStep}, whereas
+ * {@link RelationshipTarget}s simply reference the start and end node target name. This reduces the amount of required
+ * lookup logic. If the corresponding start and/or end {@link org.neo4j.importer.v1.targets.NodeReference} define
+ * key mapping overrides, the resulting {@link NodeTargetStep} get their property mappings accordingly updated.
+ * <br><br>
+ * Here is an example usage of {@link ImportPipeline} made possible by its {@link Iterable} implementation:
  * <pre>
  *     var specification = org.neo4j.importer.v1.ImportSpecificationDeserializer.deserialize(aReader);
  *     var pipeline = @link ImportPipeline.of(specification);
@@ -66,14 +77,30 @@
  *          }
  *     });
  * </pre>
- * <br><br>
- * Since an {@link ImportStep} may have dependencies, which are either:<br><br>
- * - implicit like a {@link TargetStep} depending on a {@link SourceStep},
- * a {@link RelationshipTargetStep} depending on start/end {@link NodeTargetStep}s<br>
- * - and/or explicit (via {@link ImportStep#dependencies()}<br><br>
- * ... the pipeline guarantees that dependencies are *always* returned after their dependents.<br>
- * In particular, the dependencies of each {@link ActionStep} are resolved at pipeline construction, based on the
- * provided import specification and the corresponding {@link Action}'s {@link ActionStage}.
+ * The iteration returns every step in order.
+ * Each step is guaranteed to be processed after all its implicit and explicit dependencies.
+ * <br>
+ * Implicit dependencies are:<br><br>
+ * - {@link TargetStep} depending on a {@link SourceStep}<br>
+ * - {@link RelationshipTargetStep} depending on start/end {@link NodeTargetStep}s<br>
+ * - {@link RelationshipTargetStep} sharing common start/end nodes with other {@link RelationshipTargetStep}<br/>
+ * - {@link ActionStep} must define an {@link ActionStage}, which gets translated to a set of concrete dependencies<br>
+ * <br>
+ * Relationships sharing common nodes must not be imported in parallel as this would likely cause
+ * deadlock issues. Such relationships are defined with an explicit dependency between them.<br>
+ *<br>
+ * {@link Action}s with {@link ActionStage#POST_QUERIES} are translated to
+ * instance of {@link ActionStep}s with dependencies on all the {@link CustomQueryTargetStep} defined in the pipeline.
+ * {@link ActionStep}s with {@link ActionStage#PRE_NODES} result in making all the {@link NodeTargetStep} in
+ * the pipeline to depend on these actions.
+ * Finally, {@link ActionStep}s with {@link ActionStage#END} get the following dependencies:<br><br>
+ *  - all declared {@link SourceStep}<br>
+ *  - all declared {@link TargetStep}<br>
+ *  - all declared {@link ActionStep} with an {@link ActionStage} different from {@link ActionStage#END}<br>
+ * <br>
+ * Dependencies can also be explicit:<br><br>
+ *  - {@link TargetStep} can define dependencies to other {@link TargetStep}<br>
+ * <br>
  */
 public class ImportPipeline implements Iterable<ImportStep>, Serializable {
 
@@ -94,6 +121,12 @@ public Iterator<ImportStep> iterator() {
         return stepGraph.keySet().iterator();
     }
 
+    /**
+     * Returns an {@link ImportExecutionPlan}, which makes it easier to write parallelizable import backends, compared
+     * to the sequential {@link Iterable} API that {@link ImportPipeline} exposes.<br>
+     * Please consult the documentation of {@link ImportExecutionPlan} for more details.
+     * @return the import execution plan
+     */
     public ImportExecutionPlan executionPlan() {
         return ImportExecutionPlan.fromGraph(stepGraph);
     }