diff --git a/pom.xml b/pom.xml index e6fb7d10dc..4458343275 100644 --- a/pom.xml +++ b/pom.xml @@ -191,9 +191,9 @@ v0.0.44-dev4 v0.0.40 v0.0.44-dev4 - 1.4.4 - 1.4.4 - 1.4.4 + 1.4.5 + 1.4.5 + 1.4.5 diff --git a/vcell-apiclient/src/main/java/org/vcell/api/server/AsynchMessageManager.java b/vcell-apiclient/src/main/java/org/vcell/api/server/AsynchMessageManager.java index 20f953c1d2..4371afd32f 100644 --- a/vcell-apiclient/src/main/java/org/vcell/api/server/AsynchMessageManager.java +++ b/vcell-apiclient/src/main/java/org/vcell/api/server/AsynchMessageManager.java @@ -88,7 +88,7 @@ public AsynchMessageManager(ClientServerManager csm) { * no-op if already called */ public synchronized void startPolling() { - if (!bPoll.get()) { + if (!bPoll.get() && !clientServerManager.isVCellClientDevMain()) { bPoll.set(true); if (executorService == null) { executorService = VCellExecutorService.get(); diff --git a/vcell-apiclient/src/main/java/org/vcell/api/server/ClientServerManager.java b/vcell-apiclient/src/main/java/org/vcell/api/server/ClientServerManager.java index 4be69e0a4f..67eb1ff9fd 100644 --- a/vcell-apiclient/src/main/java/org/vcell/api/server/ClientServerManager.java +++ b/vcell-apiclient/src/main/java/org/vcell/api/server/ClientServerManager.java @@ -45,7 +45,11 @@ public class ClientServerManager implements ClientServerInterface { private final static Logger lg = LogManager.getLogger(ClientServerManager.class); private final VCellConnectionFactory vcellConnectionFactory; public final Auth0ConnectionUtils auth0ConnectionUtils; - + + public boolean isVCellClientDevMain() { + return !(vcellConnectionFactory instanceof RemoteProxyVCellConnectionFactory); + } + public interface InteractiveContextDefaultProvider { InteractiveClientServerContext getInteractiveContext(); } diff --git a/vcell-client/src/main/java/cbit/vcell/solver/ode/gui/LangevinOptionsPanel.java b/vcell-client/src/main/java/cbit/vcell/solver/ode/gui/LangevinOptionsPanel.java index e63aa01a9d..c9b59ba74b 100644 --- a/vcell-client/src/main/java/cbit/vcell/solver/ode/gui/LangevinOptionsPanel.java +++ b/vcell-client/src/main/java/cbit/vcell/solver/ode/gui/LangevinOptionsPanel.java @@ -6,8 +6,6 @@ import javax.swing.*; import javax.swing.border.Border; -import cbit.vcell.solver.NFsimSimulationOptions; -import org.vcell.solver.nfsim.gui.NFSimSimulationOptionsPanel; import org.vcell.util.gui.CollapsiblePanel; import cbit.vcell.client.PopupGenerator; @@ -24,10 +22,10 @@ public class LangevinOptionsPanel extends CollapsiblePanel { private javax.swing.JRadioButton multiRunRadioButton = null; private javax.swing.ButtonGroup buttonGroupTrials = null; - private JLabel numOfTrialsLabel = null; - private JTextField ivjJTextFieldNumOfTrials = null; - private JLabel numOfParallelLocalRuns = null; - private JTextField ivjJTextFieldNumOfParallelLocalRuns = null; + private JLabel totalNumberOfJobsLabel = null; + private JTextField totalNumberOfJobsJTextField = null; + private JLabel numberOfConcurrentJobsLabel = null; + private JTextField numberOfConcurrentJobsJTextField = null; private JTextField numPartitionsXTextField = null; private JTextField numPartitionsYTextField = null; @@ -68,21 +66,24 @@ public void actionPerformed(java.awt.event.ActionEvent e) { // setNewOptions(); // } if (e.getSource() == getTrajectoryButton()) { - getJTextFieldNumOfTrials().setEnabled(false); - getJTextFieldNumOfParallelLocalRuns().setEnabled(false); - getJTextFieldNumOfParallelLocalRuns().setText(""); - solverTaskDescription.setNumTrials(1); - 
getJTextFieldNumOfTrials().setText(""); + getTotalNumberOfJobsJTextField().setEnabled(false); + getNumberOfConcurrentJobsJTextField().setEnabled(false); + getNumberOfConcurrentJobsJTextField().setText(""); + getTotalNumberOfJobsJTextField().setText(""); + solverTaskDescription.getLangevinSimulationOptions().setNumberOfConcurrentJobs(1); + solverTaskDescription.getLangevinSimulationOptions().setTotalNumberOfJobs(1); } else if (e.getSource() == getMultiRunButton()) { - getJTextFieldNumOfTrials().setEnabled(true); - getJTextFieldNumOfParallelLocalRuns().setEnabled(false); - getJTextFieldNumOfParallelLocalRuns().setText(solverTaskDescription.getLangevinSimulationOptions().getNumOfParallelLocalRuns()+""); - int numTrials = solverTaskDescription.getNumTrials(); - if(numTrials > 1) { // a multi-trial number is already set - getJTextFieldNumOfTrials().setText(numTrials+""); + getTotalNumberOfJobsJTextField().setEnabled(true); + getNumberOfConcurrentJobsJTextField().setEnabled(true); + int totalNumberOfJobs = solverTaskDescription.getLangevinSimulationOptions().getTotalNumberOfJobs(); + if(totalNumberOfJobs > 1) { // a multi-trial number is already set + getTotalNumberOfJobsJTextField().setText(totalNumberOfJobs+""); + getNumberOfConcurrentJobsJTextField().setText(solverTaskDescription.getLangevinSimulationOptions().getNumberOfConcurrentJobs()+""); } else { - solverTaskDescription.setNumTrials(SolverTaskDescription.DefaultNumTrials); - getJTextFieldNumOfTrials().setText(SolverTaskDescription.DefaultNumTrials+""); + solverTaskDescription.getLangevinSimulationOptions().setTotalNumberOfJobs(LangevinSimulationOptions.DefaultTotalNumberOfJobs); + solverTaskDescription.getLangevinSimulationOptions().setNumberOfConcurrentJobs(LangevinSimulationOptions.DefaultNumberOfConcurrentJobs); + getTotalNumberOfJobsJTextField().setText(solverTaskDescription.getLangevinSimulationOptions().getTotalNumberOfJobs()+""); + getNumberOfConcurrentJobsJTextField().setText(solverTaskDescription.getLangevinSimulationOptions().getNumberOfConcurrentJobs()+""); } } else if(e.getSource() == randomSeedCheckBox) { randomSeedTextField.setEditable(randomSeedCheckBox.isSelected()); @@ -113,29 +114,43 @@ public void focusLost(java.awt.event.FocusEvent e) { if (e.isTemporary()) { return; } - if (e.getSource() == getJTextFieldNumOfParallelLocalRuns() || - e.getSource() == getJTextFieldIntervalImage() || + if (e.getSource() == getJTextFieldIntervalImage() || e.getSource() == getJTextFieldIntervalSpring() || e.getSource() == getNumPartitionsXTextField() || e.getSource() == getNumPartitionsYTextField() || e.getSource() == getNumPartitionsXTextField() || e.getSource() == getRandomSeedTextField() ) { setNewOptions(); - } else if(e.getSource() == getJTextFieldNumOfTrials()) { - int numTrials; + } else if(e.getSource() == getTotalNumberOfJobsJTextField()) { + int totalNumberOfJobs; try { - numTrials = Integer.parseInt(getJTextFieldNumOfTrials().getText()); - if(numTrials < 2) { - numTrials = SolverTaskDescription.DefaultNumTrials; + totalNumberOfJobs = Integer.parseInt(getTotalNumberOfJobsJTextField().getText()); + if(totalNumberOfJobs < 2) { + totalNumberOfJobs = LangevinSimulationOptions.DefaultTotalNumberOfJobs; } } catch(NumberFormatException ex) { - numTrials = solverTaskDescription.getNumTrials(); - if(numTrials < 2) { - numTrials = SolverTaskDescription.DefaultNumTrials; + totalNumberOfJobs = solverTaskDescription.getLangevinSimulationOptions().getTotalNumberOfJobs(); + if(totalNumberOfJobs < 2) { + totalNumberOfJobs = 
LangevinSimulationOptions.DefaultTotalNumberOfJobs; } } - solverTaskDescription.setNumTrials(numTrials); - getJTextFieldNumOfTrials().setText(numTrials+""); + solverTaskDescription.getLangevinSimulationOptions().setTotalNumberOfJobs(totalNumberOfJobs); + getTotalNumberOfJobsJTextField().setText(totalNumberOfJobs+""); + } else if(e.getSource() == getNumberOfConcurrentJobsJTextField()) { + int numberOfConcurrentJobs; + try { + numberOfConcurrentJobs = Integer.parseInt(getNumberOfConcurrentJobsJTextField().getText()); + if(numberOfConcurrentJobs < 2) { + numberOfConcurrentJobs = LangevinSimulationOptions.DefaultNumberOfConcurrentJobs; + } + } catch(NumberFormatException ex) { + numberOfConcurrentJobs = solverTaskDescription.getLangevinSimulationOptions().getNumberOfConcurrentJobs(); + if(numberOfConcurrentJobs < 2) { + numberOfConcurrentJobs = LangevinSimulationOptions.DefaultNumberOfConcurrentJobs; + } + } + solverTaskDescription.getLangevinSimulationOptions().setNumberOfConcurrentJobs(numberOfConcurrentJobs); + getNumberOfConcurrentJobsJTextField().setText(numberOfConcurrentJobs+""); } } } @@ -164,8 +179,8 @@ private void initialize() { gbc.gridx = 0; gbc.gridy = 0; gbc.fill = GridBagConstraints.BOTH; -// gbc.weightx = 1.0; -// gbc.weighty = 1.0; + gbc.weightx = 1.0; + gbc.weighty = 1.0; gbc.insets = new Insets(1,1,1,1); getContentPanel().add(trialPanel, gbc); @@ -173,8 +188,8 @@ private void initialize() { gbc.gridx = 1; gbc.gridy = 0; gbc.fill = GridBagConstraints.BOTH; -// gbc.weightx = 1.0; -// gbc.weighty = 1.0; + gbc.weightx = 1.0; + gbc.weighty = 1.0; gbc.insets = new Insets(1,1,1,1); getContentPanel().add(centerPanel, gbc); @@ -182,7 +197,7 @@ private void initialize() { gbc.gridx = 2; gbc.gridy = 0; gbc.fill = GridBagConstraints.BOTH; - gbc.weightx = 1.0; + gbc.weightx = 0.0; gbc.weighty = 1.0; gbc.insets = new Insets(1,1,1,1); getContentPanel().add(rightPanel, gbc); @@ -221,7 +236,7 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,22,1,6); - trialPanel.add(getNumOfTrialsLabel(), gbc); + trialPanel.add(getTotalNumberOfJobsLabel(), gbc); gbc = new GridBagConstraints(); gbc.gridx = 2; @@ -229,7 +244,8 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,5,3,1); - trialPanel.add(getJTextFieldNumOfTrials(), gbc); + gbc.weightx = 1.0; + trialPanel.add(getTotalNumberOfJobsJTextField(), gbc); gbc = new GridBagConstraints(); gbc.gridx = 1; @@ -237,7 +253,7 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,22,1,6); - trialPanel.add(getNumOfParallelLocalRunsLabel(), gbc); + trialPanel.add(getNumberOfConcurrentJobsLabel(), gbc); gbc = new GridBagConstraints(); gbc.gridx = 2; @@ -245,7 +261,8 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,5,3,1); - trialPanel.add(getJTextFieldNumOfParallelLocalRuns(), gbc); + gbc.weightx = 1.0; + trialPanel.add(getNumberOfConcurrentJobsJTextField(), gbc); gbc = new GridBagConstraints(); gbc.gridx = 0; @@ -269,6 +286,7 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(5,5,3,1); + gbc.weightx = 1.0; trialPanel.add(getNumPartitionsXTextField(), gbc); gbc = new GridBagConstraints(); @@ -285,6 +303,7 @@ private void initialize() { 
gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,5,3,1); + gbc.weightx = 1.0; trialPanel.add(getNumPartitionsYTextField(), gbc); gbc = new GridBagConstraints(); @@ -301,16 +320,9 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,5,3,1); + gbc.weightx = 1.0; trialPanel.add(getNumPartitionsZTextField(), gbc); -// gbc = new GridBagConstraints(); // --- empty panel (filler) -// gbc.gridx = 3; -// gbc.gridy = 1; -// gbc.anchor = GridBagConstraints.EAST; -// gbc.fill = GridBagConstraints.HORIZONTAL; -// gbc.weightx = 1.0; -// trialPanel.add(new JLabel(""), gbc); - // ----- centerPanel ----------------------------------------------------- gbc = new GridBagConstraints(); gbc.gridx = 0; @@ -318,6 +330,7 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,26,1,5); + gbc.weightx = 0.0; centerPanel.add(new JLabel("Spring Interval"), gbc); gbc = new GridBagConstraints(); @@ -326,13 +339,15 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,5,3,1); + gbc.weightx = 1.0; centerPanel.add(getJTextFieldIntervalSpring(), gbc); gbc = new GridBagConstraints(); gbc.gridx = 2; gbc.gridy = 0; gbc.anchor = GridBagConstraints.WEST; - gbc.insets = new Insets(0,6,1,22); + gbc.insets = new Insets(0,6,1,0); + gbc.weightx = 0.0; centerPanel.add(new JLabel("s"), gbc); gbc = new GridBagConstraints(); @@ -341,6 +356,7 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,26,1,5); + gbc.weightx = 0.0; centerPanel.add(new JLabel("Image Interval"), gbc); gbc = new GridBagConstraints(); @@ -349,13 +365,15 @@ private void initialize() { gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.HORIZONTAL; gbc.insets = new Insets(0,5,3,1); + gbc.weightx = 1.0; centerPanel.add(getJTextFieldIntervalImage(), gbc); gbc = new GridBagConstraints(); gbc.gridx = 2; gbc.gridy = 1; gbc.anchor = GridBagConstraints.WEST; - gbc.insets = new Insets(0,6,1,22); + gbc.insets = new Insets(0,6,1,0); + gbc.weightx = 0.0; centerPanel.add(new JLabel("s"), gbc); gbc = new GridBagConstraints(); // --- empty panel (filler) @@ -363,13 +381,13 @@ private void initialize() { gbc.gridy = 2; gbc.anchor = GridBagConstraints.EAST; gbc.fill = GridBagConstraints.VERTICAL; - gbc.weightx = 1.0; + gbc.weightx = 0.0; gbc.weighty = 1.0; centerPanel.add(new JLabel(""), gbc); // ----- rightPanel ---------------------------------------------------- gbc = new GridBagConstraints(); - gbc.gridx = 3; + gbc.gridx = 0; gbc.gridy = 0; gbc.anchor = GridBagConstraints.WEST; gbc.fill = GridBagConstraints.BOTH; @@ -409,8 +427,8 @@ private void initialize() { getButtonGroupTrials().add(getMultiRunButton()); getButtonGroupTrials().setSelected(getTrajectoryButton().getModel(), true); - getJTextFieldNumOfTrials().setEnabled(false); - getJTextFieldNumOfParallelLocalRuns().setEnabled(false); + getTotalNumberOfJobsJTextField().setEnabled(false); + getNumberOfConcurrentJobsJTextField().setEnabled(false); getNumPartitionsXTextField().setEnabled(true); getNumPartitionsYTextField().setEnabled(true); @@ -455,30 +473,30 @@ private javax.swing.JRadioButton getMultiRunButton() { } return multiRunRadioButton; } - private javax.swing.JTextField getJTextFieldNumOfTrials() { - if 
(ivjJTextFieldNumOfTrials == null) { + private javax.swing.JTextField getTotalNumberOfJobsJTextField() { + if (totalNumberOfJobsJTextField == null) { try { - ivjJTextFieldNumOfTrials = new javax.swing.JTextField(); - ivjJTextFieldNumOfTrials.setName("JTextFieldNumOfTrials"); - ivjJTextFieldNumOfTrials.setColumns(9); - ivjJTextFieldNumOfTrials.setText(""); + totalNumberOfJobsJTextField = new javax.swing.JTextField(); + totalNumberOfJobsJTextField.setName("JTextFieldNumOfTrials"); + totalNumberOfJobsJTextField.setColumns(9); + totalNumberOfJobsJTextField.setText(""); } catch (java.lang.Throwable ivjExc) { handleException(ivjExc); } } - return ivjJTextFieldNumOfTrials; + return totalNumberOfJobsJTextField; } - private javax.swing.JLabel getNumOfTrialsLabel() { - if (numOfTrialsLabel == null) { + private javax.swing.JLabel getTotalNumberOfJobsLabel() { + if (totalNumberOfJobsLabel == null) { try { - numOfTrialsLabel = new javax.swing.JLabel(); - numOfTrialsLabel.setName("NumOfTrials"); - numOfTrialsLabel.setText("Num. Of Trials"); + totalNumberOfJobsLabel = new javax.swing.JLabel(); + totalNumberOfJobsLabel.setName("TotalNumberOfJobs"); + totalNumberOfJobsLabel.setText("Total Num. Of Jobs"); } catch (java.lang.Throwable ivjExc) { handleException(ivjExc); } } - return numOfTrialsLabel; + return totalNumberOfJobsLabel; } private JCheckBox getRandomSeedCheckBox() { if(randomSeedCheckBox == null) { @@ -597,30 +615,30 @@ private javax.swing.JLabel getNumPartitionsLabel() { return numPartitionsLabel; } - private javax.swing.JTextField getJTextFieldNumOfParallelLocalRuns() { - if (ivjJTextFieldNumOfParallelLocalRuns == null) { + private javax.swing.JTextField getNumberOfConcurrentJobsJTextField() { + if (numberOfConcurrentJobsJTextField == null) { try { - ivjJTextFieldNumOfParallelLocalRuns = new javax.swing.JTextField(); - ivjJTextFieldNumOfParallelLocalRuns.setName("JTextFieldNumOfParallelLocalRuns"); - ivjJTextFieldNumOfParallelLocalRuns.setColumns(3); - ivjJTextFieldNumOfParallelLocalRuns.setText(""); + numberOfConcurrentJobsJTextField = new javax.swing.JTextField(); + numberOfConcurrentJobsJTextField.setName("NumberOfConcurrentJobsJTextField"); + numberOfConcurrentJobsJTextField.setColumns(3); + numberOfConcurrentJobsJTextField.setText(""); } catch (java.lang.Throwable ivjExc) { handleException(ivjExc); } } - return ivjJTextFieldNumOfParallelLocalRuns; + return numberOfConcurrentJobsJTextField; } - private javax.swing.JLabel getNumOfParallelLocalRunsLabel() { - if (numOfParallelLocalRuns == null) { + private javax.swing.JLabel getNumberOfConcurrentJobsLabel() { + if (numberOfConcurrentJobsLabel == null) { try { - numOfParallelLocalRuns = new javax.swing.JLabel(); - numOfParallelLocalRuns.setName("NumOfParallelLocalRuns"); - numOfParallelLocalRuns.setText("Parallel Local Runs"); + numberOfConcurrentJobsLabel = new javax.swing.JLabel(); + numberOfConcurrentJobsLabel.setName("NumberOfConcurrentJobsLabel"); + numberOfConcurrentJobsLabel.setText("Num. 
Concurrent Jobs"); } catch (java.lang.Throwable ivjExc) { handleException(ivjExc); } } - return numOfParallelLocalRuns; + return numberOfConcurrentJobsLabel; } private JTextField getJTextFieldIntervalImage() { @@ -681,8 +699,8 @@ private void initConnections() { getRandomSeedCheckBox().addActionListener(ivjEventHandler); getRandomSeedHelpButton().addActionListener(ivjEventHandler); - getJTextFieldNumOfTrials().addFocusListener(ivjEventHandler); - getJTextFieldNumOfParallelLocalRuns().addFocusListener(ivjEventHandler); + getTotalNumberOfJobsJTextField().addFocusListener(ivjEventHandler); + getNumberOfConcurrentJobsJTextField().addFocusListener(ivjEventHandler); getJTextFieldIntervalImage().addFocusListener(ivjEventHandler); getJTextFieldIntervalSpring().addFocusListener(ivjEventHandler); getNumPartitionsXTextField().addFocusListener(ivjEventHandler); @@ -704,20 +722,20 @@ private void refresh() { getMultiRunButton().setEnabled(true); LangevinSimulationOptions lso = solverTaskDescription.getLangevinSimulationOptions(); - int numTrials = solverTaskDescription.getNumTrials(); - int numOfParallelLocalRuns = lso.getNumOfParallelLocalRuns(); - if(numTrials == 1) { + int totalNumberOfJobs = lso.getTotalNumberOfJobs(); + int numberOfConcurrentJobs = lso.getNumberOfConcurrentJobs(); + if(totalNumberOfJobs == 1) { getTrajectoryButton().setSelected(true); - getJTextFieldNumOfTrials().setEnabled(false); - getJTextFieldNumOfTrials().setText(""); - getJTextFieldNumOfParallelLocalRuns().setEnabled(false); - getJTextFieldNumOfParallelLocalRuns().setText(""); + getTotalNumberOfJobsJTextField().setEnabled(false); + getTotalNumberOfJobsJTextField().setText(""); + getNumberOfConcurrentJobsJTextField().setEnabled(false); + getNumberOfConcurrentJobsJTextField().setText(""); } else { getMultiRunButton().setSelected(true); - getJTextFieldNumOfTrials().setEnabled(true); - getJTextFieldNumOfTrials().setText(numTrials+""); - getJTextFieldNumOfParallelLocalRuns().setEnabled(false); - getJTextFieldNumOfParallelLocalRuns().setText(numOfParallelLocalRuns + ""); + getTotalNumberOfJobsJTextField().setEnabled(true); + getTotalNumberOfJobsJTextField().setText(totalNumberOfJobs+""); + getNumberOfConcurrentJobsJTextField().setEnabled(true); + getNumberOfConcurrentJobsJTextField().setText(numberOfConcurrentJobs + ""); } getNumPartitionsXTextField().setText(lso.getNPart(0) + ""); @@ -744,17 +762,23 @@ private void setNewOptions() { return; } try { - LangevinSimulationOptions sso = solverTaskDescription.getLangevinSimulationOptions(); - int numOfParallelLocalRuns = 1; + LangevinSimulationOptions sso = solverTaskDescription.getLangevinSimulationOptions(); + int totalNumberOfJobs = 1; + int numberOfConcurrentJobs = 1; double intervalImage = solverTaskDescription.getLangevinSimulationOptions().getIntervalImage(); double intervalSpring = solverTaskDescription.getLangevinSimulationOptions().getIntervalSpring(); int[] npart = solverTaskDescription.getLangevinSimulationOptions().getNPart(); - if(getMultiRunButton().isSelected()) { // we can get here only on FocusLost event in the numOfTrials text field + if(getMultiRunButton().isSelected()) { // we can get here only on FocusLost event in the totalNumberOfJobs text field + try { + totalNumberOfJobs = Integer.parseInt(getTotalNumberOfJobsJTextField().getText()); + } catch (NumberFormatException ex) { + totalNumberOfJobs = sso.getTotalNumberOfJobs(); + } try { - numOfParallelLocalRuns = Integer.parseInt(getJTextFieldNumOfParallelLocalRuns().getText()); + numberOfConcurrentJobs = 
Integer.parseInt(getNumberOfConcurrentJobsJTextField().getText()); } catch (NumberFormatException ex) { - numOfParallelLocalRuns = sso.getNumOfParallelLocalRuns(); + numberOfConcurrentJobs = sso.getNumberOfConcurrentJobs(); } } @@ -780,7 +804,8 @@ private void setNewOptions() { // make a copy LangevinSimulationOptions lso = new LangevinSimulationOptions(sso); - lso.setNumOfParallelLocalRuns(numOfParallelLocalRuns); + lso.setTotalNumberOfJobs(totalNumberOfJobs); + lso.setNumberOfConcurrentJobs(numberOfConcurrentJobs); lso.setRandomSeed(randomSeed); lso.setIntervalImage(intervalImage); lso.setIntervalSpring(intervalSpring); diff --git a/vcell-core/src/main/java/cbit/vcell/math/VCML.java b/vcell-core/src/main/java/cbit/vcell/math/VCML.java index 75b946a1fa..afea7780cc 100644 --- a/vcell-core/src/main/java/cbit/vcell/math/VCML.java +++ b/vcell-core/src/main/java/cbit/vcell/math/VCML.java @@ -253,6 +253,9 @@ public class VCML { public final static String LangevinSimulationOptions_Partition_Nx = "PartitionNx"; public final static String LangevinSimulationOptions_Partition_Ny = "PartitionNy"; public final static String LangevinSimulationOptions_Partition_Nz = "PartitionNz"; + public final static String LangevinSimulationOptions_numberOfConcurrentJobs = "NumberOfConcurrentJobs"; + public final static String LangevinSimulationOptions_totalNumberOfJobs = "TotalNumberOfJobs"; + // deprecated, kept for backward compatibility with old .vcml files; use totalNumberOfJobs instead public final static String LangevinSimulationOptions_numOfParallelLocalRuns = "NumOfParallelLocalRuns"; public final static String NFSimSimulationOptions = "NFSimSimulationOptions"; diff --git a/vcell-core/src/main/java/cbit/vcell/solver/LangevinSimulationOptions.java b/vcell-core/src/main/java/cbit/vcell/solver/LangevinSimulationOptions.java index 9cc9136252..24d6c83a88 100644 --- a/vcell-core/src/main/java/cbit/vcell/solver/LangevinSimulationOptions.java +++ b/vcell-core/src/main/java/cbit/vcell/solver/LangevinSimulationOptions.java @@ -27,7 +27,8 @@ public class LangevinSimulationOptions implements Serializable, Matchable, VetoableChangeListener { - // TODO: add the partition definitions in the LangevinOptionsPanel + public final static int DefaultNumberOfConcurrentJobs = 20; // used for multiple runs on the cluster + public final static int DefaultTotalNumberOfJobs = 100; public final static String Partition_Nx = "Partition Nx: "; public final static String Partition_Ny = "Partition Ny: "; @@ -46,7 +47,11 @@ public class LangevinSimulationOptions implements Serializable, Matchable, Vetoa // randomSeed may be null, in which case the solver will generate its own randomSeed as it already does protected BigInteger randomSeed = null; - protected int numOfParallelLocalRuns = 1; // how many instances of the solver run in parallel + // both initialized to 1 - only one job will be run on the cluster + protected int totalNumberOfJobs = 1; // how many jobs will be run on the cluster + protected int numberOfConcurrentJobs = 1; // how many instances of the solver may run concurrently on the cluster +// @Deprecated + protected int numOfParallelLocalRuns = 1; // replaced by numberOfConcurrentJobs but kept for backward compatibility protected double intervalSpring = 1.00E-9; // default: dtspring: 1.00E-9 - from advanced protected double intervalImage = 1.00E-4; // default: dtimage: 1.00E-4 - from advanced @@ -63,7 +68,9 @@ public LangevinSimulationOptions() { public LangevinSimulationOptions(LangevinSimulationOptions 
langevinSimulationOptions) { this(); randomSeed = langevinSimulationOptions.randomSeed; - numOfParallelLocalRuns = langevinSimulationOptions.numOfParallelLocalRuns; +// numOfParallelLocalRuns = langevinSimulationOptions.numOfParallelLocalRuns; + totalNumberOfJobs = langevinSimulationOptions.totalNumberOfJobs; + numberOfConcurrentJobs = langevinSimulationOptions.numberOfConcurrentJobs; intervalSpring = langevinSimulationOptions.intervalSpring; intervalImage = langevinSimulationOptions.intervalImage; npart[0] = langevinSimulationOptions.npart[0]; @@ -84,7 +91,10 @@ public boolean compareEqual(Matchable obj) { if(randomSeed != langevinSimulationOptions.randomSeed) { return false; } - if(numOfParallelLocalRuns != langevinSimulationOptions.numOfParallelLocalRuns) { + if(totalNumberOfJobs != langevinSimulationOptions.totalNumberOfJobs) { + return false; + } + if(numberOfConcurrentJobs != langevinSimulationOptions.numberOfConcurrentJobs) { return false; } if(intervalSpring != langevinSimulationOptions.intervalSpring) { @@ -102,10 +112,17 @@ public boolean compareEqual(Matchable obj) { } // ----------------------------------------------------------------------------------- - // can be between 0 and numOfTrials-1 - public int getNumOfParallelLocalRuns() { - return numOfParallelLocalRuns; +// @Deprecated +// public int getNumOfParallelLocalRuns() { // // can be between 0 and numOfTrials-1 +// return numOfParallelLocalRuns; +// } + public int getTotalNumberOfJobs() { + return totalNumberOfJobs; } + public int getNumberOfConcurrentJobs() { // // can be between 0 and totalNumberOfJobs-1 + return numberOfConcurrentJobs; + } + public double getIntervalSpring() { return intervalSpring; } @@ -122,8 +139,15 @@ public BigInteger getRandomSeed() { return randomSeed; } - public final void setNumOfParallelLocalRuns(int newValue) { - this.numOfParallelLocalRuns = newValue; +// @Deprecated +// public final void setNumOfParallelLocalRuns(int newValue) { +// this.numOfParallelLocalRuns = newValue; +// } +public final void setTotalNumberOfJobs(int newValue) { + this.totalNumberOfJobs = newValue; +} + public final void setNumberOfConcurrentJobs(int newValue) { + this.numberOfConcurrentJobs = newValue; } public final void setIntervalSpring(double newValue) { this.intervalSpring = newValue; @@ -187,7 +211,8 @@ public String getVCML() { buffer.append("\t\t" + VCML.LangevinSimulationOptions_Partition_Nx + " " + npart[0] + "\n"); buffer.append("\t\t" + VCML.LangevinSimulationOptions_Partition_Ny + " " + npart[1] + "\n"); buffer.append("\t\t" + VCML.LangevinSimulationOptions_Partition_Nz + " " + npart[2] + "\n"); - buffer.append("\t\t" + VCML.LangevinSimulationOptions_numOfParallelLocalRuns + " " + numOfParallelLocalRuns + "\n"); + buffer.append("\t\t" + VCML.LangevinSimulationOptions_numberOfConcurrentJobs + " " + numberOfConcurrentJobs + "\n"); + buffer.append("\t\t" + VCML.LangevinSimulationOptions_totalNumberOfJobs + " " + totalNumberOfJobs + "\n"); buffer.append("\t" + VCML.EndBlock + "\n"); return buffer.toString(); } @@ -211,7 +236,13 @@ public void readVCML(CommentStringTokenizer tokens) throws DataAccessException { randomSeed = new BigInteger(token); } else if(token.equalsIgnoreCase(VCML.LangevinSimulationOptions_numOfParallelLocalRuns)) { token = tokens.nextToken(); - numOfParallelLocalRuns = Integer.parseInt(token); + numOfParallelLocalRuns = Integer.parseInt(token); // not in use anymore, may be present in some old VCML files + } else if(token.equalsIgnoreCase(VCML.LangevinSimulationOptions_totalNumberOfJobs)) 
{ + token = tokens.nextToken(); + totalNumberOfJobs = Integer.parseInt(token); + } else if(token.equalsIgnoreCase(VCML.LangevinSimulationOptions_numberOfConcurrentJobs)) { + token = tokens.nextToken(); + numberOfConcurrentJobs = Integer.parseInt(token); } else if(token.equalsIgnoreCase(VCML.LangevinSimulationOptions_intervalSpring)) { token = tokens.nextToken(); intervalSpring = Double.parseDouble(token); diff --git a/vcell-core/src/main/java/cbit/vcell/solver/SolverLongDesc.java b/vcell-core/src/main/java/cbit/vcell/solver/SolverLongDesc.java index 79960ae175..e01b920a18 100644 --- a/vcell-core/src/main/java/cbit/vcell/solver/SolverLongDesc.java +++ b/vcell-core/src/main/java/cbit/vcell/solver/SolverLongDesc.java @@ -467,7 +467,7 @@ interface SolverLongDesc { "LangevinNoVis is a free, open-source, particle-based, stochastic, simulator platform suitable " + "for modeling mesoscopic systems, which are far too large to be modeled with molecular dynamics " + "but which require more detail than obtainable with macroscopic continuum models. Molecules are " + - "modeled as a collection of impenetrable spheres (called “sites”) linked by stiff springs." + + "modeled as a collection of impenetrable spheres (called 'sites') linked by stiff springs." + ""; static final String COMSOL = diff --git a/vcell-core/src/main/java/cbit/vcell/xml/XMLTags.java b/vcell-core/src/main/java/cbit/vcell/xml/XMLTags.java index 8051c46652..a8693e1dce 100644 --- a/vcell-core/src/main/java/cbit/vcell/xml/XMLTags.java +++ b/vcell-core/src/main/java/cbit/vcell/xml/XMLTags.java @@ -806,8 +806,11 @@ public class XMLTags { public final static String LangevinSO_Partition_Nx = "PartitionNx"; public final static String LangevinSO_Partition_Ny = "PartitionNy"; public final static String LangevinSO_Partition_Nz = "PartitionNz"; - public final static String LangevinSO_numOfParallelLocalRuns = "NumOfParallelLocalRuns"; + public final static String LangevinSO_numberOfConcurrentJobs = "NumberOfConcurrentJobs"; + public final static String LangevinSO_totalNumberOfJobs = "TotalNumberOfJobs"; public final static String LangevinSO_randomSeed = "LangevinRandomSeed"; +// @Deprecated +// public final static String LangevinSO_numOfParallelLocalRuns = "NumOfParallelLocalRuns"; public final static String ParticleInitialConcentrationTag = "ParticleInitialConcentration"; // particle public final static String ParticleDistributionTag = "ParticleDistribution"; // particle diff --git a/vcell-core/src/main/java/cbit/vcell/xml/XmlReader.java b/vcell-core/src/main/java/cbit/vcell/xml/XmlReader.java index c374212ca5..4268115224 100644 --- a/vcell-core/src/main/java/cbit/vcell/xml/XmlReader.java +++ b/vcell-core/src/main/java/cbit/vcell/xml/XmlReader.java @@ -6722,9 +6722,13 @@ private LangevinSimulationOptions getLangevinSimulationOptions(Element langevinS if(temp != null) { lo.setNPart(2, Integer.parseInt(temp)); } - temp = langevinSimulationOptionsElement.getChildText(XMLTags.LangevinSO_numOfParallelLocalRuns, vcNamespace); + temp = langevinSimulationOptionsElement.getChildText(XMLTags.LangevinSO_totalNumberOfJobs, vcNamespace); if(temp != null) { - lo.setNumOfParallelLocalRuns(Integer.parseInt(temp)); + lo.setTotalNumberOfJobs(Integer.parseInt(temp)); + } + temp = langevinSimulationOptionsElement.getChildText(XMLTags.LangevinSO_numberOfConcurrentJobs, vcNamespace); + if(temp != null) { + lo.setNumberOfConcurrentJobs(Integer.parseInt(temp)); } temp = langevinSimulationOptionsElement.getChildText(XMLTags.LangevinSO_randomSeed, vcNamespace); if(temp != 
null) { diff --git a/vcell-core/src/main/java/cbit/vcell/xml/Xmlproducer.java b/vcell-core/src/main/java/cbit/vcell/xml/Xmlproducer.java index f6fd0fc171..55376095ba 100644 --- a/vcell-core/src/main/java/cbit/vcell/xml/Xmlproducer.java +++ b/vcell-core/src/main/java/cbit/vcell/xml/Xmlproducer.java @@ -4833,8 +4833,12 @@ private Element getXML(LangevinSimulationOptions lso) { e.setText(String.valueOf(lso.getNPart(2))); lsoe.addContent(e); - e = new Element(XMLTags.LangevinSO_numOfParallelLocalRuns); - e.setText(String.valueOf(lso.getNumOfParallelLocalRuns())); + e = new Element(XMLTags.LangevinSO_totalNumberOfJobs); + e.setText(String.valueOf(lso.getTotalNumberOfJobs())); + lsoe.addContent(e); + + e = new Element(XMLTags.LangevinSO_numberOfConcurrentJobs); + e.setText(String.valueOf(lso.getNumberOfConcurrentJobs())); lsoe.addContent(e); if(lso.getRandomSeed() != null) { diff --git a/vcell-core/src/main/java/org/vcell/solver/langevin/LangevinLngvWriter.java b/vcell-core/src/main/java/org/vcell/solver/langevin/LangevinLngvWriter.java index dc54beedea..ca91fa0bef 100644 --- a/vcell-core/src/main/java/org/vcell/solver/langevin/LangevinLngvWriter.java +++ b/vcell-core/src/main/java/org/vcell/solver/langevin/LangevinLngvWriter.java @@ -980,7 +980,7 @@ public static void writeSimulationOptions(Simulation simulation, StringBuilder s int numTrials = std.getNumTrials(); BigInteger randomSeed = lso.getRandomSeed(); - int simultaneousRuns = lso.getNumOfParallelLocalRuns(); + int simultaneousRuns = lso.getNumberOfConcurrentJobs(); // TODO: do not delete this, until we decide on how much info the solver needs // These are not needed but may be nice to have in the future, makes the solver instance more aware @@ -988,7 +988,7 @@ public static void writeSimulationOptions(Simulation simulation, StringBuilder s // sb.append("\n"); // sb.append("Parallel: " + (simultaneousRuns == 1 ? 
"true" : "false")); // sb.append("\n"); -// sb.append("SimultaneousRuns: " + simultaneousRuns); // this is always 1 on local and quota dependent on server +// sb.append("SimultaneousRuns: " + simultaneousRuns); // this is always 1 on local and slurm dependent on server // sb.append("\n"); // if we don't specify a random seed the solver will do its thing like in the past (use system time in ms) diff --git a/vcell-core/src/test/java/cbit/vcell/pathway/PathwaySearchTest.java b/vcell-core/src/test/java/cbit/vcell/pathway/PathwaySearchTest.java index 6cb50dff4e..d32edf4225 100644 --- a/vcell-core/src/test/java/cbit/vcell/pathway/PathwaySearchTest.java +++ b/vcell-core/src/test/java/cbit/vcell/pathway/PathwaySearchTest.java @@ -9,10 +9,7 @@ import org.jdom2.input.SAXBuilder; import org.jdom2.output.Format; import org.jdom2.output.XMLOutputter; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; @@ -51,6 +48,11 @@ public class PathwaySearchTest { private static final Namespace BP_NS = Namespace.getNamespace("bp", "http://www.biopax.org/release/biopax-level2.owl#"); + private static final String reactomeSite = "https://reactome.org"; + private static final String ncbiSite = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=taxonomy"; + private static final String pathwaysSite = "https://www.pathwaycommons.org"; + + @BeforeAll public static void setUp() throws IOException { @@ -74,6 +76,9 @@ public static void tearDown() { */ @Test public void pathwayDownloadTest() throws MalformedURLException { + // skips the test if the external database is down, no point failing CI over something that's not our fault + Assumptions.assumeTrue(isDatabaseAvailable(reactomeSite), "Reactome is down — skipping test"); + String pathwayId = "5683177"; // Reactome pathway ID pathwayDownload(pathwayId); lg.debug("pathwayDownloadTest - done"); @@ -84,6 +89,7 @@ public void pathwayDownloadTest() throws MalformedURLException { // returns the searchResponse.xml resource @Test public void searchTest() throws IOException { + Assumptions.assumeTrue(isDatabaseAvailable(pathwaysSite), "PathwayCommons is down — skipping test"); String searchText = "Insulin"; String encodedQ = URLEncoder.encode('"' + searchText + '"', StandardCharsets.UTF_8.name()); @@ -117,8 +123,9 @@ public void searchTest() throws IOException { @Test public void fetchTaxonomyNameFromIdTest() { - String taxonId = "9940"; // Ovis aries (sheep) + Assumptions.assumeTrue(isDatabaseAvailable(ncbiSite), "Taxonomy Database is down — skipping test"); + String taxonId = "9940"; // Ovis aries (sheep) try { HttpResponse response = OrganismLookup.fetchTaxonomyResponse(taxonId); assertEquals(200, response.statusCode(), "Unexpected HTTP status"); @@ -399,6 +406,23 @@ private static void writeFilteredPathway(Document doc, String pathwayId, String } } + public static boolean isDatabaseAvailable(String urlString) { + try { + URL url = new URL(urlString); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setConnectTimeout(3000); // 3 seconds + conn.setReadTimeout(3000); + conn.setRequestMethod("GET"); // HEAD is ideal, but some APIs reject it + conn.connect(); + + int code = conn.getResponseCode(); + return code == 200; // true only if return code is HTTP 200 OK + } catch (Exception e) { + return false; + } + } + + // 
------------------------------------------------------------------------------------------------------------------ public static void main(String[] args) { diff --git a/vcell-core/src/test/java/cbit/vcell/simdata/LangevinPostProcessorTest.java b/vcell-core/src/test/java/cbit/vcell/simdata/LangevinPostProcessorTest.java index f51613fee6..55baf96b55 100644 --- a/vcell-core/src/test/java/cbit/vcell/simdata/LangevinPostProcessorTest.java +++ b/vcell-core/src/test/java/cbit/vcell/simdata/LangevinPostProcessorTest.java @@ -19,7 +19,7 @@ @Tag("Fast") public class LangevinPostProcessorTest { - // the .IDA files are in vcell-core/src/test/java /cbit/vcell/simdata + // the .IDA files are in vcell-core/src/test/resources /cbit/vcell/simdata static File ida_0_File; static File ida_1_File; static File ida_2_File; diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java index 180501b0b4..44503e2ca4 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java @@ -49,6 +49,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.vcell.dependency.server.VCellServerModule; +import org.vcell.solver.langevin.LangevinSolver; import org.vcell.util.document.KeyValue; import org.vcell.util.document.User; import org.vcell.util.exe.ExecutableException; @@ -558,22 +559,58 @@ private HtcJobID submit2PBS(SimulationTask simTask, HtcProxy clonedHtcProxy, Pos int ncpus = simTask.getSimulation().getSolverTaskDescription().getNumProcessors(); //CBN? Collection postProcessingCommands = new ArrayList(); - if (realSolver instanceof AbstractCompiledSolver) { - AbstractCompiledSolver compiledSolver = (AbstractCompiledSolver)realSolver; - - List args = new ArrayList<>( 4 ); - args.add( PropertyLoader.getRequiredProperty(PropertyLoader.simulationPreprocessor) ); - args.add( simTaskFilePathExternal ); - args.add( primaryUserDirExternal.getAbsolutePath() ); - if ( chores.isParallel()) { + + boolean is_langevin_batch = (realSolver instanceof LangevinSolver && simTask.getSimulation().getSolverTaskDescription().getNumTrials() > 1); + if (realSolver instanceof AbstractCompiledSolver && !is_langevin_batch) { + AbstractCompiledSolver compiledSolver = (AbstractCompiledSolver) realSolver; + + List args = new ArrayList<>(4); + args.add(PropertyLoader.getRequiredProperty(PropertyLoader.simulationPreprocessor)); + args.add(simTaskFilePathExternal); + args.add(primaryUserDirExternal.getAbsolutePath()); + if (chores.isParallel()) { + args.add(chores.runDirectoryExternal); + } + // compiled solver ...used to be only single executable, now we pass 2 commands to PBSUtils.submitJob that invokes SolverPreprocessor.main() and then the native executable + //the pre-processor command itself is neither messaging nor parallel; it's independent of the subsequent solver call + ExecutableCommand preprocessorCmd = new ExecutableCommand(null, false, false, args); + commandContainer.add(preprocessorCmd); + + for (ExecutableCommand ec : compiledSolver.getCommands()) { + if (ec.isMessaging()) { + ec.addArgument("-tid"); + ec.addArgument(simTask.getTaskID()); + } + commandContainer.add(ec); + } + + if (chores.isCopyNeeded()) { + String logName = chores.finalDataDirectoryInternal + '/' + SimulationData.createCanonicalSimLogFileName(simKey, jobId, false); + 
CopySimFiles csf = new CopySimFiles(simTask.getSimulationJobID(), chores.runDirectoryInternal, chores.finalDataDirectoryInternal, logName); + postProcessingCommands.add(csf); + } + if (chores.isVtkUser()) { + VtkMeshGenerator vmg = new VtkMeshGenerator(simOwner, simKey, jobId); + postProcessingCommands.add(vmg); + } +// if(chores.isStochMultiTrial()) { +// final String logName = chores.finalDataDirectoryInternal + '/' + SimulationData.createCanonicalSimLogFileName(simKey, jobId, false); +// postProcessingCommands.add(new AvgStochMultiTrial(primaryUserDirInternal.getAbsolutePath(), XmlHelper.simTaskToXML(simTask))); +// } + } else if (realSolver instanceof LangevinSolver langevinSolver && is_langevin_batch){ + List args = new ArrayList<>(4); + args.add(PropertyLoader.getRequiredProperty(PropertyLoader.simulationPreprocessor)); + args.add(simTaskFilePathExternal); + args.add(primaryUserDirExternal.getAbsolutePath()); + if (chores.isParallel()) { args.add(chores.runDirectoryExternal); } // compiled solver ...used to be only single executable, now we pass 2 commands to PBSUtils.submitJob that invokes SolverPreprocessor.main() and then the native executable //the pre-processor command itself is neither messaging nor parallel; it's independent of the subsequent solver call - ExecutableCommand preprocessorCmd = new ExecutableCommand(null, false, false,args); + ExecutableCommand preprocessorCmd = new ExecutableCommand(null, false, false, args); commandContainer.add(preprocessorCmd); - - for (ExecutableCommand ec : compiledSolver.getCommands()) { + + for (ExecutableCommand ec : langevinSolver.getCommands()) { if (ec.isMessaging()) { ec.addArgument("-tid"); ec.addArgument(simTask.getTaskID()); @@ -594,6 +631,7 @@ private HtcJobID submit2PBS(SimulationTask simTask, HtcProxy clonedHtcProxy, Pos // final String logName = chores.finalDataDirectoryInternal + '/' + SimulationData.createCanonicalSimLogFileName(simKey, jobId, false); // postProcessingCommands.add(new AvgStochMultiTrial(primaryUserDirInternal.getAbsolutePath(), XmlHelper.simTaskToXML(simTask))); // } + } else { ExecutableCommand ec = new ExecutableCommand(null, false,false, PropertyLoader.getRequiredProperty(PropertyLoader.javaSimulationExecutable), diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/htc/slurm/SlurmProxy.java b/vcell-server/src/main/java/cbit/vcell/message/server/htc/slurm/SlurmProxy.java index 85c0a086a8..9e01c50ade 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/htc/slurm/SlurmProxy.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/htc/slurm/SlurmProxy.java @@ -14,19 +14,22 @@ import cbit.vcell.server.HtcJobID.BatchSystemType; import cbit.vcell.simdata.PortableCommand; import cbit.vcell.simdata.PortableCommandWrapper; +import cbit.vcell.solver.LangevinSimulationOptions; import cbit.vcell.solver.SolverDescription; +import cbit.vcell.solver.SolverTaskDescription; import cbit.vcell.solvers.AbstractSolver; import cbit.vcell.solvers.ExecutableCommand; import edu.uchc.connjur.wb.LineStringBuilder; import org.vcell.util.document.KeyValue; import org.vcell.util.exe.ExecutableException; -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.StringReader; +import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.*; +import java.util.stream.Collectors; public class SlurmProxy extends HtcProxy { @@ -416,6 +419,282 @@ public 
String getPostProcessCommands() { } + private static int getRoundedMemoryLimit(long memPerTaskMB) { + int rawLimit = (int)(memPerTaskMB * 0.9); + // Round down to nearest 100 MB + return (rawLimit / 100) * 100; + } + private static String extractUser(ExecutableCommand.Container commandSet) { + for (ExecutableCommand ec: commandSet.getExecCommands()) { + ExecutableCommand.Container commandSet2 = new ExecutableCommand.Container(); + if(ec.getCommands().get(0).equals("JavaPreprocessor64")) { + continue; + }else { + for(String command : ec.getCommands()) { + if(command.startsWith("-Duser=")) { + return command.substring(7); + } + else if(command.contains("langevinInput")) { + // Langevin input file is of the form /some/path/users/... + String[] pathParts = command.split("/"); + for(int i=0; i= 0 required"); + if (numberOfConcurrentTasks <= 0) throw new IllegalArgumentException("numberOfConcurrentTasks > 0 required"); + if (timeoutSeconds <= 0) throw new IllegalArgumentException("timeoutSeconds > 0 required"); + + int perTaskMinutes = (timeoutSeconds + 59) / 60; // ceiling(timeoutSeconds/60) + int batches = (totalNumberOfJobs + numberOfConcurrentTasks - 1) / numberOfConcurrentTasks; + long workMinutes = (long) batches * perTaskMinutes; + long extraMinutes = 3L * perTaskMinutes; + long totalMinutes = workMinutes + extraMinutes; + long cushionedMinutes = (long) Math.ceil(totalMinutes * 1.10); + + long totalHours = cushionedMinutes / 60; + long minutes = cushionedMinutes % 60; + + if (totalHours < 100) { + return String.format("%02d:%02d:00", totalHours, minutes); + } else { + long days = totalHours / 24; + long hours = totalHours % 24; + return String.format("%d-%02d:%02d:00", days, hours, minutes); + } + } + private void writeScriptControlledVariables(LineStringBuilder lsb, String jobName, String userId, + SimulationTask simTask, int jobTimeoutSeconds) { + String simId = simTask.getSimulationInfo().getSimulationVersion().getVersionKey().toString(); + int totalJobs = simTask.getSimulation().getSolverTaskDescription().getLangevinSimulationOptions().getTotalNumberOfJobs(); + String htcLogDir = PropertyLoader.getRequiredProperty(PropertyLoader.htcLogDirExternal); + String simDataDir = PropertyLoader.getRequiredProperty(PropertyLoader.primarySimDataDirExternalProperty); + int lastUnderscore = jobName.lastIndexOf('_'); + String trimmedJobName = (lastUnderscore >= 0) ? 
jobName.substring(0, lastUnderscore + 1) : jobName; + String logFilePath = htcLogDir + "/" + trimmedJobName + ".submit.log"; + String messagingConfigFilePath = simDataDir + "/" + userId + "/SimID_" + simId + "_0_.langevinMessagingConfig"; + + lsb.write("# Script-controlled variables (populated by generator in real use)"); + lsb.write("USERID=" + userId); + lsb.write("SIM_ID=" + simId); + lsb.write("TOTAL_JOBS=" + totalJobs + " # to be set by generator to lso.getTotalNumberOfJobs()"); + lsb.write("JOB_TIMEOUT_SECONDS=" + jobTimeoutSeconds + " # per-job timeout (seconds), adjust per generator"); + lsb.write("LOG_FILE=\"" + logFilePath + "\""); + lsb.write("MESSAGING_CONFIG_FILE=\"" + messagingConfigFilePath + "\""); + lsb.write(""); + + lsb.write("# Truncate / delete various logs and the solver input file, to start clean"); + lsb.write(": > " + htcLogDir + "/V_TEST2_${SIM_ID}_0_.slurm.log"); + lsb.write("rm -f " + simDataDir + "/${USERID}/SimID_${SIM_ID}_0_*.log"); + lsb.write("rm -f " + simDataDir + "/${USERID}/SimID_${SIM_ID}_0__*.ida"); + lsb.write("rm -f " + simDataDir + "/${USERID}/SimID_${SIM_ID}_0__*.json"); + lsb.write("rm -f " + simDataDir + "/${USERID}/SimID_${SIM_ID}_0_.functions"); + lsb.write("rm -f " + simDataDir + "/${USERID}/SimID_${SIM_ID}_0_.langevinInput"); + lsb.write("rm -f " + simDataDir + "/${USERID}/SimID_${SIM_ID}_0_.langevinMessagingConfig"); + lsb.write(""); + } + private void writeSingularitySetup(LineStringBuilder lsb) { + String slurmTmpDir = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_tmpdir); + String singularityCachedir = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_singularity_cachedir); + String singularityPullfolder = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_singularity_pullfolder); + String singularityModuleName = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_singularity_module_name); + + lsb.write("echo \"=== Singularity check BEFORE module load ===\""); + lsb.write("if command -v singularity >/dev/null 2>&1; then"); + lsb.write(" echo \"Singularity found at: $(command -v singularity)\""); + lsb.write(" singularity --version"); + lsb.write("else"); + lsb.write(" echo \"Singularity not found before module load\""); + lsb.write("fi"); + lsb.write(""); + + lsb.write("TMPDIR=" + slurmTmpDir); + lsb.write("if [ ! 
-e $TMPDIR ]; then mkdir -p $TMPDIR ; fi"); + lsb.write("echo `hostname`"); + lsb.write("export MODULEPATH=/isg/shared/modulefiles:/tgcapps/modulefiles"); + lsb.write("if [ -f /usr/share/modules/init/bash ]; then"); + lsb.write(" source /usr/share/modules/init/bash"); + lsb.write(" module load " + singularityModuleName); + lsb.write("else"); + lsb.write(" echo \"[Warning] Module init script not found - skipping module setup\""); + lsb.write("fi"); + lsb.write("export SINGULARITY_CACHEDIR=" + singularityCachedir); + lsb.write("export SINGULARITY_PULLFOLDER=" + singularityPullfolder); + lsb.write(""); + + lsb.write("echo \"=== Singularity check AFTER module load ===\""); + lsb.write("if command -v singularity >/dev/null 2>&1; then"); + lsb.write(" echo \"Singularity found at: $(command -v singularity)\""); + lsb.write(" singularity --version"); + lsb.write("else"); + lsb.write(" echo \"Singularity not found after module load\""); + lsb.write(" exit 127"); + lsb.write("fi"); + lsb.write(""); + } + private void writeSlurmJobMetadata(LineStringBuilder lsb) { + lsb.write("# Compute memory per task and per job"); + lsb.write("MEM_TASK=$(( SLURM_MEM_PER_CPU * SLURM_CPUS_PER_TASK ))"); + lsb.write("MEM_JOB=$(( MEM_TASK * SLURM_NTASKS ))"); + lsb.write(""); + + lsb.write("echo \"======= SLURM job started =======\""); + lsb.write("echo \"Hostname : $(hostname -f)\""); + lsb.write("echo \"User : $USERID\""); + lsb.write("echo \"Sim ID : $SIM_ID\""); + lsb.write("echo \"id : $(id)\""); + lsb.write("echo \"Total Jobs : $TOTAL_JOBS\""); + lsb.write("echo \"Job Timeout : $JOB_TIMEOUT_SECONDS\""); + lsb.write("echo \"Slurm Job ID : $SLURM_JOB_ID\""); + lsb.write("echo \"Slurm Job Name : $SLURM_JOB_NAME\""); + lsb.write("echo \"Start Time : $(date)\""); + lsb.write("echo \"Working Dir : $(pwd)\""); + lsb.write("echo \"Node List : $SLURM_NODELIST\""); + lsb.write("echo \"CPUs per task : $SLURM_CPUS_PER_TASK\""); + lsb.write("echo \"Mem. per task : ${MEM_TASK} MB total\""); + lsb.write("echo \"Mem. 
per job : ${MEM_JOB} MB total\""); + lsb.write("echo \"Environment snapshot:\""); + lsb.write("env"); + lsb.write("echo \"=================================\""); + lsb.write(""); + } + private void writeContainerBindingsAndEnv(LineStringBuilder lsb, int javaMemXmx) { + String primaryDataDir = PropertyLoader.getRequiredProperty(PropertyLoader.primarySimDataDirExternalProperty); + String secondaryDataDir = PropertyLoader.getRequiredProperty(PropertyLoader.secondarySimDataDirExternalProperty); + String archiveDataDirHost = PropertyLoader.getRequiredProperty(PropertyLoader.simDataDirArchiveExternal); + String archiveDataDirContainer = PropertyLoader.getRequiredProperty(PropertyLoader.simDataDirArchiveInternal); + String htclogDir = PropertyLoader.getRequiredProperty(PropertyLoader.htcLogDirExternal); + String scratchDir = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_tmpdir); + + lsb.write("container_bindings=\"--bind " + primaryDataDir + ":/simdata \""); + lsb.write("container_bindings+=\"--bind " + secondaryDataDir + ":/simdata_secondary \""); + lsb.write("container_bindings+=\"--bind " + archiveDataDirHost + ":" + archiveDataDirContainer + " \""); + lsb.write("container_bindings+=\"--bind " + htclogDir + ":/htclogs \""); + lsb.write("container_bindings+=\"--bind " + scratchDir + ":/solvertmp \""); + lsb.write(""); + + String jmsHost = PropertyLoader.getRequiredProperty(PropertyLoader.jmsSimHostExternal); + String jmsPort = PropertyLoader.getRequiredProperty(PropertyLoader.jmsSimPortExternal); + String jmsRestPort = PropertyLoader.getRequiredProperty(PropertyLoader.jmsSimRestPortExternal); + String jmsUser = PropertyLoader.getRequiredProperty(PropertyLoader.jmsUser); + String jmsPswd = PropertyLoader.getSecretValue(PropertyLoader.jmsPasswordValue, PropertyLoader.jmsPasswordFile); + String jmsBlobMinSize = PropertyLoader.getProperty(PropertyLoader.jmsBlobMessageMinSize, "100000"); + String mongoHost = PropertyLoader.getRequiredProperty(PropertyLoader.mongodbHostExternal); + String mongoPort = PropertyLoader.getRequiredProperty(PropertyLoader.mongodbPortExternal); + String mongoDB = PropertyLoader.getRequiredProperty(PropertyLoader.mongodbDatabase); + String softwareVersion = PropertyLoader.getRequiredProperty(PropertyLoader.vcellSoftwareVersion); + String serverId = PropertyLoader.getRequiredProperty(PropertyLoader.vcellServerIDProperty); + + lsb.write("container_env=\"--env java_mem_Xmx=" + javaMemXmx + "M \""); + lsb.write("container_env+=\"--env jmshost_sim_internal=" + jmsHost + " \""); + lsb.write("container_env+=\"--env jmsport_sim_internal=" + jmsPort + " \""); + lsb.write("container_env+=\"--env jmsrestport_sim_internal=" + jmsRestPort + " \""); + lsb.write("container_env+=\"--env jmsuser=" + jmsUser + " \""); + lsb.write("container_env+=\"--env jmspswd=" + jmsPswd + " \""); + lsb.write("container_env+=\"--env jmsblob_minsize=" + jmsBlobMinSize + " \""); + lsb.write("container_env+=\"--env mongodbhost_internal=" + mongoHost + " \""); + lsb.write("container_env+=\"--env mongodbport_internal=" + mongoPort + " \""); + lsb.write("container_env+=\"--env mongodb_database=" + mongoDB + " \""); + lsb.write("container_env+=\"--env primary_datadir_external=" + primaryDataDir + " \""); + lsb.write("container_env+=\"--env secondary_datadir_external=" + secondaryDataDir + " \""); + lsb.write("container_env+=\"--env htclogdir_external=" + htclogDir + " \""); + lsb.write("container_env+=\"--env softwareVersion=" + softwareVersion + " \""); + lsb.write("container_env+=\"--env serverid=" + 
serverId + " \""); + lsb.write(""); + } + private void writeContainerImageAndPrefixes(LineStringBuilder lsb) { + // Resolve docker image names + final String batchDockerName = PropertyLoader.getRequiredProperty(PropertyLoader.htc_vcellbatch_docker_name); + final String solverDockerName = batchDockerName; + + // Write out docker names and prefixes + lsb.write("# Full solver command"); + lsb.write("solver_docker_name=" + solverDockerName); + lsb.write("solver_container_prefix=\"singularity run --containall " + + "${container_bindings} ${container_env} docker://${solver_docker_name}\""); + + if (batchDockerName != null && !batchDockerName.isEmpty()) { + lsb.write("batch_docker_name=" + batchDockerName); + lsb.write("batch_container_prefix=\"singularity run --containall " + + "${container_bindings} ${container_env} docker://${batch_docker_name}\""); + } + + lsb.write("slurm_prefix=\"srun -N1 -n1 -c${SLURM_CPUS_PER_TASK}\""); + lsb.write(""); + } + String generateLangevinBatchScript(String jobName, ExecutableCommand.Container commandSet, double memSizeMB, + Collection postProcessingCommands, SimulationTask simTask) { + + // TODO: extractUser is very unrobust, must be fixed + // it may be the userName can be obtained like so: String vcellUserid = simTask.getUser().getName(); + String userName = extractUser(commandSet); + String vcellUserid = simTask.getUser().getName(); + KeyValue simID = simTask.getSimulationInfo().getSimulationVersion().getVersionKey(); + SolverTaskDescription std = simTask.getSimulation().getSolverTaskDescription(); + LangevinSimulationOptions lso = std.getLangevinSimulationOptions(); + int totalNumberOfJobs = lso.getTotalNumberOfJobs(); + int numberOfConcurrentTasks = lso.getNumberOfConcurrentJobs(); + SolverDescription solverDescription = std.getSolverDescription(); + MemLimitResults memoryMBAllowed = HtcProxy.getMemoryLimit(vcellUserid, simID, solverDescription, memSizeMB, simTask.isPowerUser()); + + int timeoutPerTaskSeconds = 28800; // 8 hours TODO: do we hardcode this? Should it be part of LangevinSimulationOptions? 
+ String slurmJobTimeout = computeSlurmTimeLimit(totalNumberOfJobs, numberOfConcurrentTasks, timeoutPerTaskSeconds); + int javaMemXmx = getRoundedMemoryLimit(memoryMBAllowed.getMemLimit()); + + // ------------------------------------------------------------- + + LineStringBuilder lsb = new LineStringBuilder(); + slurmBatchScriptInit(jobName, simTask.isPowerUser(), memoryMBAllowed, numberOfConcurrentTasks, slurmJobTimeout, lsb); + writeScriptControlledVariables(lsb, jobName, userName, simTask, timeoutPerTaskSeconds); + writeSingularitySetup(lsb); + writeSlurmJobMetadata(lsb); + writeContainerBindingsAndEnv(lsb, javaMemXmx); + writeContainerImageAndPrefixes(lsb); + + String langevinFixture; + try { + langevinFixture = readTextFileFromResource("slurm/templates/langevinFixture.slurm.sub"); + } catch (IOException ex) { + throw new IllegalStateException("Failed to load orchestration fixture", ex); + } + lsb.write(langevinFixture); + + Charset asciiCharset = StandardCharsets.US_ASCII; + CharsetEncoder encoder = asciiCharset.newEncoder(); + String langevinBatchString = lsb.sb.toString(); + for (int i = 0; i < langevinBatchString.length(); i++) { + char c = langevinBatchString.charAt(i); + if (!encoder.canEncode(c)) { + System.err.printf("Unmappable character at index %d: U+%04X (%c)%n", i, (int) c, c); + } + } + return langevinBatchString; + } + SbatchSolverComponents generateScript(String jobName, ExecutableCommand.Container commandSet, double memSizeMB, Collection postProcessingCommands, SimulationTask simTask) { //SlurmProxy ultimately instantiated from {vcellroot}/docker/build/Dockerfile-submit-dev by way of cbit.vcell.message.server.batch.sim.HtcSimulationWorker @@ -537,7 +816,6 @@ SbatchSolverComponents generateScript(String jobName, ExecutableCommand.Containe return new SbatchSolverComponents(slurmCommands.sb.toString(), lsb.sb.toString(),preProcessLSB.sb.toString(),singularityLSB.sb.toString(),callExitLSB.sb.toString(),sendFailMsgLSB.sb.toString(),exitLSB.sb.toString(),postProcessLSB.sb.toString()); } - private void callExitScript(ExecutableCommand.Container commandSet, LineStringBuilder lsb) { lsb.newline(); ExecutableCommand exitCmd = commandSet.getExitCodeCommand(); @@ -656,8 +934,16 @@ private void slurmInitSingularity(LineStringBuilder lsb, private void slurmScriptInit(String jobName, boolean bPowerUser, MemLimitResults memoryMBAllowed, LineStringBuilder lsb) { + String os = System.getProperty("os.name").toLowerCase(); + boolean isWindows = os.startsWith("windows"); + lsb.write("#!/usr/bin/bash"); File htcLogDirExternal = new File(PropertyLoader.getRequiredProperty(PropertyLoader.htcLogDirExternal)); + String logPath = new File(htcLogDirExternal, jobName + ".slurm.log").getAbsolutePath(); + if(isWindows) { + logPath = logPath.replaceAll("[A-Za-z]:", "").replace("\\", "/"); + } + if(bPowerUser) { String partition_pu = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_partition_pu); String reservation_pu = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_reservation_pu); @@ -674,8 +960,8 @@ private void slurmScriptInit(String jobName, boolean bPowerUser, MemLimitResults lsb.write("#SBATCH --qos=" +qos); } lsb.write("#SBATCH -J " + jobName); - lsb.write("#SBATCH -o " + new File(htcLogDirExternal, jobName+".slurm.log").getAbsolutePath()); - lsb.write("#SBATCH -e " + new File(htcLogDirExternal, jobName+".slurm.log").getAbsolutePath()); + lsb.write("#SBATCH -o " + logPath); + lsb.write("#SBATCH -e " + logPath); lsb.write("#SBATCH --mem="+memoryMBAllowed.getMemLimit()+"M"); 
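// Example of the Windows log-path normalization above (hypothetical path): a logPath of
// "C:\htclogs\V_TEST2_12345_0_0.slurm.log" is rewritten to "/htclogs/V_TEST2_12345_0_0.slurm.log",
// so the #SBATCH -o/-e directives stay POSIX-style even when the script is generated on a
// Windows development machine.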
lsb.write("#SBATCH --no-kill"); lsb.write("#SBATCH --no-requeue"); @@ -686,33 +972,94 @@ private void slurmScriptInit(String jobName, boolean bPowerUser, MemLimitResults lsb.write("# VCell SlurmProxy memory limit source='"+memoryMBAllowed.getMemLimitSource()+"'"); } + private void slurmBatchScriptInit(String jobName, boolean isPowerUser, MemLimitResults memoryMBAllowed, + int numberOfConcurrentTasks, String jobTimeout, LineStringBuilder lsb) { + lsb.write("#!/usr/bin/bash"); + + if (isPowerUser) { + String partitionPU = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_partition_pu); + String qosPU = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_qos_pu); + lsb.write("#SBATCH --partition=" + partitionPU); + lsb.write("#SBATCH --reservation="); // intentionally blank + lsb.write("#SBATCH --qos=" + qosPU); + } else { + String partition = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_partition); + String reservation = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_reservation); + String qos = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_qos); + lsb.write("#SBATCH --partition=" + partition); + lsb.write("#SBATCH --reservation=" + reservation); + lsb.write("#SBATCH --qos=" + qos); + } + + lsb.write("#SBATCH -J " + jobName); + + String htcLogDir = PropertyLoader.getRequiredProperty(PropertyLoader.htcLogDirExternal); + int lastUnderscore = jobName.lastIndexOf('_'); + String trimmedJobName = (lastUnderscore >= 0) ? jobName.substring(0, lastUnderscore + 1) : jobName; + String logPath = new File(htcLogDir, trimmedJobName + ".slurm.log").getPath().replace("\\", "/"); + + lsb.write("#SBATCH -o " + logPath); + lsb.write("#SBATCH -e " + logPath); + lsb.write("#SBATCH --ntasks=" + numberOfConcurrentTasks + "\t\t\t# number of concurrent tasks"); + // TODO: hardcoded for now, adjust if needed + lsb.write("#SBATCH --cpus-per-task=1"); + // TODO: mem per cpu needs to be adjusted, 2M should be enough for most Langevin tasks + lsb.write("#SBATCH --mem-per-cpu=" + memoryMBAllowed.getMemLimit() + "M"); + lsb.write("#SBATCH --nodes=1"); + lsb.write("#SBATCH --time=" + jobTimeout + "\t\t# timeout for the entire job"); + lsb.write("#SBATCH --no-kill"); + lsb.write("#SBATCH --no-requeue"); + + lsb.write(""); // blank line before shell options + lsb.write("set -o errexit"); + lsb.write("set -o pipefail"); + lsb.write("set -o nounset"); + lsb.write("set +e"); + lsb.write(""); + } + @Override public HtcJobID submitJob(String jobName, File sub_file_as_internal_path, File sub_file_with_external_path, ExecutableCommand.Container commandSet, int ncpus, double memSizeMB, Collection postProcessingCommands, SimulationTask simTask,File primaryUserDirExternal) throws ExecutableException, IOException { - String scriptText = createJobScriptText(jobName, commandSet, ncpus, memSizeMB, postProcessingCommands, simTask); + SolverTaskDescription std = simTask.getSimulationJob().getSimulation().getSolverTaskDescription(); + String scriptText; + if(std.getSolverDescription().isLangevinSolver() && std.getLangevinSimulationOptions().getTotalNumberOfJobs() > 1) { + scriptText = createBatchJobScriptText(jobName, commandSet, ncpus, memSizeMB, postProcessingCommands, simTask); + } else { + scriptText = createJobScriptText(jobName, commandSet, ncpus, memSizeMB, postProcessingCommands, simTask); + } Files.writeString(sub_file_as_internal_path.toPath(), scriptText); return submitJobFile(sub_file_with_external_path); } String createJobScriptText(String jobName, ExecutableCommand.Container commandSet, 
int ncpus, double memSizeMB, Collection postProcessingCommands, SimulationTask simTask) throws IOException { - if (LG.isDebugEnabled()) { - LG.debug("generating local SLURM submit script for jobName="+jobName); - } - SlurmProxy.SbatchSolverComponents sbatchSolverComponents = generateScript(jobName, commandSet, memSizeMB, postProcessingCommands, simTask); - - StringBuilder scriptContent = new StringBuilder(); - scriptContent.append(sbatchSolverComponents.getSingularityCommands()); - scriptContent.append(sbatchSolverComponents.getSendFailureMsgCommands()); - scriptContent.append(sbatchSolverComponents.getCallExitCommands()); - scriptContent.append(sbatchSolverComponents.getPreProcessCommands()); - scriptContent.append(sbatchSolverComponents.solverCommands); - scriptContent.append(sbatchSolverComponents.getExitCommands()); - String substitutedSbatchCommands = sbatchSolverComponents.getSbatchCommands(); - String origScriptText = substitutedSbatchCommands+"\n\n"+ - scriptContent.toString()+"\n\n"+ - "#Following commands (if any) are read by JavaPostProcessor64\n"+ - sbatchSolverComponents.postProcessCommands+"\n"; - String scriptText = toUnixStyleText(origScriptText); - return scriptText; + if (LG.isDebugEnabled()) { + LG.debug("generating local SLURM submit script for jobName="+jobName); + } + SlurmProxy.SbatchSolverComponents sbatchSolverComponents = generateScript(jobName, commandSet, memSizeMB, postProcessingCommands, simTask); + + StringBuilder scriptContent = new StringBuilder(); + scriptContent.append(sbatchSolverComponents.getSingularityCommands()); + scriptContent.append(sbatchSolverComponents.getSendFailureMsgCommands()); + scriptContent.append(sbatchSolverComponents.getCallExitCommands()); + scriptContent.append(sbatchSolverComponents.getPreProcessCommands()); + scriptContent.append(sbatchSolverComponents.solverCommands); + scriptContent.append(sbatchSolverComponents.getExitCommands()); + String substitutedSbatchCommands = sbatchSolverComponents.getSbatchCommands(); + String origScriptText = substitutedSbatchCommands+"\n\n"+ + scriptContent.toString()+"\n\n"+ + "#Following commands (if any) are read by JavaPostProcessor64\n"+ + sbatchSolverComponents.postProcessCommands+"\n"; + String scriptText = toUnixStyleText(origScriptText); + return scriptText; + } + + String createBatchJobScriptText(String jobName, ExecutableCommand.Container commandSet, int ncpus, double memSizeMB, Collection postProcessingCommands, SimulationTask simTask) throws IOException { + if (LG.isDebugEnabled()) { + LG.debug("generating local SLURM submit script for jobName="+jobName); + } + String origScriptText = generateLangevinBatchScript(jobName, commandSet, memSizeMB, postProcessingCommands, simTask); + String scriptText = toUnixStyleText(origScriptText); + return scriptText; } HtcJobID submitJobFile(File sub_file_external) throws ExecutableException { @@ -806,5 +1153,16 @@ String createOptJobScript(String jobName, File optProblemInputFile, File optProb return toUnixStyleText(lsb.toString()); } + private String readTextFileFromResource(String filename) throws IOException { + InputStream inputStream = getClass().getClassLoader().getResourceAsStream(filename); + if (inputStream == null) { + throw new IOException("Resource not found: " + filename); + } + String xmlString; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) { + xmlString = reader.lines().collect(Collectors.joining(System.lineSeparator())); + } + return xmlString; + } } diff --git 
a/vcell-server/src/main/resources/slurm/templates/langevinFixture.slurm.sub b/vcell-server/src/main/resources/slurm/templates/langevinFixture.slurm.sub new file mode 100644 index 0000000000..4df3e9ce1d --- /dev/null +++ b/vcell-server/src/main/resources/slurm/templates/langevinFixture.slurm.sub @@ -0,0 +1,226 @@ +# === From here down, the code is identical for all jobs and should be copied from a resource file === + +echo "Job Execution Log" > "${LOG_FILE}" +echo "------------------" >> "${LOG_FILE}" +echo "=== [$(date)] === SLURM job ${SLURM_JOB_ID} started on $(hostname)" | tee -a "${LOG_FILE}" + +SIMXML="/simdata/${USERID}/SimID_${SIM_ID}_0__0.simtask.xml" # preprocessor XML path (per-user) +echo "[Preprocess] Running JavaPreprocessor64 ${SIMXML}" | tee -a "${LOG_FILE}" + +# run JavaPreprocessor64 inside batch container (use generated batch prefix) +${batch_container_prefix} JavaPreprocessor64 "${SIMXML}" "/simdata/${USERID}" +stat=$? +echo "JavaPreprocessor64 returned ${stat}" | tee -a "${LOG_FILE}" +if [ $stat -ne 0 ]; then + ${batch_container_prefix} JavaPostprocessor64 "${SIM_ID}" "${USERID}" 17 0 0 $stat "/htclogs/${SLURM_JOB_NAME}.slurm.sub" || true + echo "Preprocessor failed; exiting ${stat}" | tee -a "${LOG_FILE}" + exit $stat +fi + +# ---------------------------------------------------------------------------------------------- + +echo "Parse the messaging config file" +broker_host=""; broker_port=""; broker_username=""; broker_password="" +vc_username=""; simKey=""; taskID=""; jobIndex="" +while IFS="=" read -r key value; do + case "$key" in + broker_host) broker_host="$value" ;; + broker_port) broker_port="$value" ;; + broker_username) broker_username="$value" ;; + broker_password) broker_password="$value" ;; + vc_username) vc_username="$value" ;; + simKey) simKey="$value" ;; + taskID) taskID="$value" ;; + jobIndex) jobIndex="$value" ;; + esac +done < "$MESSAGING_CONFIG_FILE" + +echo "Parsed configuration:" +echo " broker_host = $broker_host" +echo " broker_port = $broker_port" +echo " broker_username = $broker_username" +echo " broker_password = $broker_password" +echo " vc_username = $vc_username" +echo " simKey = $simKey" +echo " taskID = $taskID" +echo " jobIndex = $jobIndex" + +statusCode=1001 +statusMsg="Running" +# Immutable part, built once +BASE_PROPERTIES="JMSDeliveryMode=persistent&JMSTimeToLive=3000" +BASE_PROPERTIES+="&SimKey=${simKey}" +BASE_PROPERTIES+="&JobIndex=${jobIndex}" +BASE_PROPERTIES+="&TaskID=${taskID}" +BASE_PROPERTIES+="&UserName=${vc_username}" +BASE_PROPERTIES+="&MessageType=WorkerEvent" +BASE_PROPERTIES+="&HostName=$(hostname)" + +INPUT_DIR="/simdata/${USERID}" # on singularity, each solver instance reads inputs from here +LOG_DIR="/simdata/${USERID}" # on singularity, each solver instance writes logs here +declare -A job_pid_map +job_pids=() +running_jobs=0 +any_fail=0 +max_concurrent_jobs=${SLURM_NTASKS:-4} # actually these are tasks +echo "max_concurrent_jobs: ${max_concurrent_jobs}" >> "${LOG_FILE}" +echo "TOTAL_JOBS to launch: ${TOTAL_JOBS}" >> "${LOG_FILE}" + +finished_jobs=0 # messaging counters +last_notify_time=0 +MIN_NOTIFY_INTERVAL=10 # seconds + +for ((i = 0; i < TOTAL_JOBS; i++)); do + echo "Task $i starting at $(date)" >> "${LOG_FILE}" # log task start + + ( # run each task in parallel + timeout "${JOB_TIMEOUT_SECONDS}s" \ + ${slurm_prefix} ${solver_container_prefix} \ + langevin_x64 simulate \ + --output-log="${LOG_DIR}/SimID_${SIM_ID}_0_${i}.log" \ + "${INPUT_DIR}/SimID_${SIM_ID}_0_.langevinInput" \ + "$i" \ + -tid 0 + ) & + 
pid=$! # capture the task PID + echo "PID is $pid" >> "${LOG_FILE}" + job_pids+=($pid) # store the PID + job_pid_map[$pid]=$i # map task index to PID + ((running_jobs++)) # increment running task count + echo "currently running jobs: ${running_jobs}" >> "${LOG_FILE}" + + # wait for a finished task before launching a new one if we hit the concurrency limit + while (( running_jobs >= max_concurrent_jobs )); do + for idx in "${!job_pids[@]}"; do + pid="${job_pids[$idx]}" + if [[ -z "${pid}" ]]; then continue; fi + if ! kill -0 "$pid" 2>/dev/null; then # check if process is still running + wait "$pid" # + exit_code=$? + job_index=${job_pid_map[$pid]:-unknown} # retrieve original task index + echo "Task $job_index with pid ${pid} finished with exit code $exit_code at $(date)" >> "${LOG_FILE}" + unset "job_pids[$idx]" # remove PID from active PID list + unset "job_pid_map[$pid]" # remove mapping + ((running_jobs--)) + ((finished_jobs++)) + progress=$(awk "BEGIN {print ${finished_jobs}/(${TOTAL_JOBS}+1)}") # compute progress + timepoint=$(date +%s) + if (( timepoint - last_notify_time >= MIN_NOTIFY_INTERVAL )); then + last_notify_time=$timepoint + statusCode=1001 + statusMsg="Running" + RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" + RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" + RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" + RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" + PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" + msgCommand="set -o errexit; set -o pipefail; set -o nounset" + msgCommand+=" + curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" + ${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity + echo "progress notification sent, ${progress} done" >> "${LOG_FILE}" + fi + if [[ $exit_code -ne 0 ]]; then any_fail=1; fi + break # exit for-loop once we free up a slot + fi + done + sleep 1 # allow brief pause before rechecking + done +done + +# Final wait for remaining tasks +for pid in "${job_pids[@]}"; do + if [[ -z "${pid}" ]]; then continue; fi + wait "$pid" + exit_code=$? 
+ job_index=${job_pid_map[$pid]:-unknown} + echo "Task $job_index finished with exit code $exit_code at $(date)" >> "${LOG_FILE}" + ((finished_jobs++)) + progress=$(awk "BEGIN {print ${finished_jobs}/(${TOTAL_JOBS}+1)}") # compute progress + timepoint=$(date +%s) + if (( timepoint - last_notify_time >= MIN_NOTIFY_INTERVAL )); then + last_notify_time=$timepoint + statusCode=1001 + statusMsg="Running" + RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" + RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" + RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" + RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" + PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" + msgCommand="set -o errexit; set -o pipefail; set -o nounset" + msgCommand+=" + curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" + ${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity + echo "progress notification sent, ${progress} done" >> "${LOG_FILE}" + fi + if [[ $exit_code -ne 0 ]]; then any_fail=1; fi +done +echo "Batch job completed at $(date)" >> "${LOG_FILE}" + +# ---------------------------------------------------------------------------------------------- +# postprocess solver invocation (runs after all simulations finish) +echo "Starting the last job (postprocess) at $(date)" >> "${LOG_FILE}" +timeout "${JOB_TIMEOUT_SECONDS}s" \ + ${slurm_prefix} ${solver_container_prefix} \ + langevin_x64 postprocess \ + "${INPUT_DIR}/SimID_${SIM_ID}_0_.langevinInput" \ + ${TOTAL_JOBS} \ + --output-log="${LOG_DIR}/SimID_${SIM_ID}_0_P.log" \ + --vc-print-status & + +last_pid=$! +wait $last_pid +exit_code=$? +echo "Task 'Last' with PID $last_pid finished with exit code $exit_code at $(date)" >> "${LOG_FILE}" +timepoint=$(date +%s) +progress="1.0" +statusCode=1001 +statusMsg="Running" +RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" +RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" +RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" +RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" +PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" +msgCommand="set -o errexit; set -o pipefail; set -o nounset" +msgCommand+=" +curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" +${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity +echo "progress notification sent, ${progress} done" >> "${LOG_FILE}" +statusCode=1003 +statusMsg="Finished" +RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" +RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" +RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" +RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" +PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" +msgCommand="set -o errexit; set -o pipefail; set -o nounset" +msgCommand+=" +curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" +${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity +echo "FINISHED notification sent" >> "${LOG_FILE}" +echo "The final task finished at $(date)" >> "${LOG_FILE}" +echo "All tasks completed at $(date)" >> "${LOG_FILE}" + +# ---------------------------------------------------------------------------------------------- +# run JavaPostprocessor64 inside batch container +echo "[Postprocess] Running 
JavaPostprocessor64..." | tee -a "${LOG_FILE}" +timeout 20s ${batch_container_prefix} JavaPostprocessor64 "${SIM_ID}" "${USERID}" 17 0 0 "${TOTAL_JOBS}" "/htclogs/${SLURM_JOB_NAME}.slurm.sub" +post_exit=$? +set -e + +# final exit decision +if [[ "${any_fail}" -ne 0 ]]; then + echo "One or more simulation tasks failed; exiting non-zero" | tee -a "${LOG_FILE}" + exit 1 +fi + +if [[ "${post_exit}" -eq 124 ]]; then + echo "JavaPostprocessor64 timed out after 20 seconds; exiting with code 124" | tee -a "${LOG_FILE}" + exit 124 +elif [[ "${post_exit}" -ne 0 ]]; then + echo "JavaPostprocessor64 failed with exit code ${post_exit}; exiting ${post_exit}" | tee -a "${LOG_FILE}" + exit "${post_exit}" +fi + +echo "=== [$(date)] === SLURM job completed successfully" | tee -a "${LOG_FILE}" +exit 0 \ No newline at end of file diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java b/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java index dda5334a7a..3039d64616 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java @@ -4,27 +4,33 @@ import cbit.vcell.parser.ExpressionException; import cbit.vcell.resource.PropertyLoader; import cbit.vcell.simdata.PortableCommand; +import cbit.vcell.solver.SolverTaskDescription; import cbit.vcell.solvers.ExecutableCommand; import cbit.vcell.xml.XmlHelper; import cbit.vcell.xml.XmlParseException; import org.junit.jupiter.api.*; +import org.junit.jupiter.api.condition.DisabledOnOs; +import org.junit.jupiter.api.condition.OS; import org.vcell.util.document.KeyValue; import org.vcell.util.document.User; import java.io.*; import java.util.ArrayList; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.stream.Collectors; +import static cbit.vcell.message.server.htc.HtcProxy.toUnixStyleText; + @Tag("Fast") public class SlurmProxyTest { - private final HashMap originalProperties = new HashMap<>(); + private final HashMap originalProperties = new LinkedHashMap<>(); private void setProperty(String key, String value) { - originalProperties.put(key, System.getProperty(key)); + originalProperties.putIfAbsent(key, System.getProperty(key)); System.setProperty(key, value); } @@ -92,11 +98,20 @@ public void teardown() { public String createScriptForNativeSolvers(String simTaskResourcePath, String[] command, String JOB_NAME) throws IOException, XmlParseException, ExpressionException { + String os = System.getProperty("os.name").toLowerCase(); + boolean isWindows = os.startsWith("windows"); + SimulationTask simTask = XmlHelper.XMLToSimTask(readTextFileFromResource(simTaskResourcePath)); KeyValue simKey = simTask.getSimKey(); SlurmProxy slurmProxy = new SlurmProxy(null, "vcell"); File subFileExternal = new File("/share/apps/vcell3/htclogs/V_REL_"+simKey+"_0_0.slurm.sub"); + String subFileExternalPath = subFileExternal.getAbsolutePath(); + if(isWindows) { + subFileExternalPath = subFileExternalPath + .replaceAll("[A-Za-z]:", "") // remove drive letter, keep the leading backslash + .replace("\\", "/"); // normalize separators + } User simOwner = simTask.getSimulation().getVersion().getOwner(); final int jobId = simTask.getSimulationJob().getJobIndex(); @@ -104,10 +119,17 @@ public String createScriptForNativeSolvers(String simTaskResourcePath, String[] // preprocessor String simTaskFilePathExternal = 
"/share/apps/vcell3/users/schaff/SimID_"+simKey+"_0__0.simtask.xml"; File primaryUserDirExternal = new File("/share/apps/vcell3/users/schaff"); + String primaryUserDirExternalPath = primaryUserDirExternal.getAbsolutePath(); + if(isWindows) { + primaryUserDirExternalPath = primaryUserDirExternalPath + .replaceAll("[A-Za-z]:", "") + .replace("\\", "/"); + } + List args = new ArrayList<>( 4 ); args.add( PropertyLoader.getRequiredProperty(PropertyLoader.simulationPreprocessor) ); args.add( simTaskFilePathExternal ); - args.add( primaryUserDirExternal.getAbsolutePath() ); + args.add( primaryUserDirExternalPath ); ExecutableCommand preprocessorCmd = new ExecutableCommand(null, false, false,args); ExecutableCommand.LibraryPath libraryPath = new ExecutableCommand.LibraryPath("/usr/local/app/localsolvers/linux64"); @@ -123,7 +145,7 @@ public String createScriptForNativeSolvers(String simTaskResourcePath, String[] Integer.toString(jobId), Integer.toString(simTask.getTaskID()), SOLVER_EXIT_CODE_REPLACE_STRING, - subFileExternal.getAbsolutePath()); + subFileExternalPath); postprocessorCmd.setExitCodeToken(SOLVER_EXIT_CODE_REPLACE_STRING); ExecutableCommand.Container commandSet = new ExecutableCommand.Container(); @@ -134,9 +156,19 @@ public String createScriptForNativeSolvers(String simTaskResourcePath, String[] int NUM_CPUs = 1; int MEM_SIZE_MB = 1000; ArrayList postProcessingCommands = new ArrayList<>(); - return slurmProxy.createJobScriptText(JOB_NAME, commandSet, NUM_CPUs, MEM_SIZE_MB, postProcessingCommands, simTask); + + SolverTaskDescription std = simTask.getSimulationJob().getSimulation().getSolverTaskDescription(); + String scriptText; + if(std.getSolverDescription().isLangevinSolver() && std.getLangevinSimulationOptions().getTotalNumberOfJobs() > 1) { + scriptText = slurmProxy.createBatchJobScriptText(JOB_NAME, commandSet, NUM_CPUs, MEM_SIZE_MB, postProcessingCommands, simTask); + } else { + scriptText = slurmProxy.createJobScriptText(JOB_NAME, commandSet, NUM_CPUs, MEM_SIZE_MB, postProcessingCommands, simTask); + } + return scriptText; } + + public String createScriptForJavaSolvers(String simTaskResourcePath, String JOB_NAME) throws IOException, XmlParseException, ExpressionException { SimulationTask simTask = XmlHelper.XMLToSimTask(readTextFileFromResource(simTaskResourcePath)); @@ -237,6 +269,7 @@ public void testSimJobScriptCVODE() throws IOException, XmlParseException, Expre Assertions.assertEquals(expectedSlurmScript.trim(), slurmScript.trim()); } + @DisabledOnOs(OS.WINDOWS) @Test public void testSimJobScriptLangevin() throws IOException, XmlParseException, ExpressionException { String simTaskResourcePath = "slurm_fixtures/langevin/SimID_274672135_0__0.simtask.xml"; @@ -254,6 +287,34 @@ public void testSimJobScriptLangevin() throws IOException, XmlParseException, Ex Assertions.assertEquals(expectedSlurmScript.trim(), slurmScript.trim()); } + @Test + public void testSimJobScriptLangevinBatch() throws IOException, XmlParseException, ExpressionException { + setProperty(PropertyLoader.htc_vcellopt_docker_name, "ghcr.io/virtualcell/vcell-opt:7.7.0.39"); + setProperty(PropertyLoader.vcellSoftwareVersion, "Rel_Version_7.7.0_build_39"); + setProperty(PropertyLoader.vcellServerIDProperty,"TEST2"); + setProperty(PropertyLoader.jmsSimHostExternal, "k8s-wn-01.cam.uchc.edu"); + setProperty(PropertyLoader.htc_vcellbatch_docker_name, "ghcr.io/virtualcell/vcell-batch:7.7.0.39"); + + String simTaskResourcePath = "slurm_fixtures/langevin/SimID_999999999_0__0.simtask.xml"; + String JOB_NAME = 
"V_TEST2_999999999_0_0"; + + String executable = "/usr/local/app/localsolvers/linux64/langevin_x64"; + String outputLog = "/share/apps/vcell3/users/danv/SimID_999999999_0_.log"; + String messagingConfig = "/share/apps/vcell3/users/danv/SimID_999999999_0_.langevinMessagingConfig"; + String inputFilePath = "/share/apps/vcell3/users/danv/SimID_999999999_0_.langevinInput"; + String[] command = new String[] { executable, "simulate", "--output-log="+outputLog, + "--vc-send-status-config="+messagingConfig, inputFilePath, "0", "-tid", "0" }; + + String actualSlurmScript = createScriptForNativeSolvers(simTaskResourcePath, command, JOB_NAME); + actualSlurmScript = toUnixStyleText(actualSlurmScript); + + // this is the source of truth + String expectedSlurmScript = readTextFileFromResource("slurm_fixtures/langevin/V_TEST2_999999999_0_0.slurm.sub"); + expectedSlurmScript = toUnixStyleText(expectedSlurmScript); + + Assertions.assertEquals(expectedSlurmScript.trim(), actualSlurmScript.trim(), "Strings should be equal"); + } + @Test public void testSimJobScriptNFsim() throws IOException, XmlParseException, ExpressionException { String simTaskResourcePath = "slurm_fixtures/nfsim/SimID_274642453_0__0.simtask.xml"; diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/SimID_888888888_0_.langevinMessagingConfig b/vcell-server/src/test/resources/slurm_fixtures/langevin/SimID_888888888_0_.langevinMessagingConfig new file mode 100644 index 0000000000..a4e36b4139 --- /dev/null +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/SimID_888888888_0_.langevinMessagingConfig @@ -0,0 +1,8 @@ +broker_host=k8s-wn-01.cam.uchc.edu +broker_port=30162 +broker_username=admin +broker_password=admin +vc_username=temp +simKey=888888888 +taskID=0 +jobIndex=0 diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/SimID_999999999_0__0.simtask.xml b/vcell-server/src/test/resources/slurm_fixtures/langevin/SimID_999999999_0__0.simtask.xml new file mode 100644 index 0000000000..713cc565f6 --- /dev/null +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/SimID_999999999_0__0.simtask.xml @@ -0,0 +1,190 @@ + + + + + + + + + + + + + 96485.3321 + 9.64853321E-5 + 6.02214179E11 + 3.141592653589793 + 8314.46261815 + 300.0 + 10.0 + 0.001660538783162726 + 0.0 + 30.0 + 1.0499999999999999E-4 + 8.949999999999999E-4 + 0.009999999999999827 + (1.0 * pow(KMOLE,1.0)) + 0.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ((MT0_Count * UnitFactor_uM_um3_molecules_neg_1) / Size_Intracellular) + ((O0_0_Count * UnitFactor_uM_um3_molecules_neg_1) / Size_Intracellular) + ((O0_1_Count * UnitFactor_uM_um3_molecules_neg_1) / Size_Intracellular) + Kf + Kr + + + + + + + + + + + P_binding_probabilityRate + 1.0 + 1.0 + 1.0 + + + + P_binding_reverse_probabilityRate + 1.0 + 1.0 + 1.0 + + + + MT0_Count_initCount + 0.0 + 0.0 + 0.0 + + 0.0 + 0.0 + 0.0 + 0.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1.0E-9 + 1.0E-4 + 10 + 10 + 10 + 8 + 3 + + 1 + + + + + + + + + + + + + + + (z < 0.09) + + + 1.0 + + + + + + + + + + + + + \ No newline at end of file diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/V_TEST2_888888888_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_TEST2_888888888_0_0.slurm.sub new file mode 100644 index 0000000000..8123f77ab6 --- /dev/null +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_TEST2_888888888_0_0.slurm.sub @@ -0,0 +1,170 @@ +#!/usr/bin/bash +#SBATCH 
--partition=vcell +#SBATCH --reservation= +#SBATCH --qos=vcell +#SBATCH -J V_TEST2_888888888_0_0 +#SBATCH -o /share/apps/vcell3/htclogs/V_TEST2_888888888_0_.slurm.log +#SBATCH -e /share/apps/vcell3/htclogs/V_TEST2_888888888_0_.slurm.log +#SBATCH --ntasks=3 # number of concurrent tasks +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=4096M +#SBATCH --nodes=1 +#SBATCH --time=00:33:00 # timeout for the entire job +#SBATCH --no-kill +#SBATCH --no-requeue + +set -o errexit +set -o pipefail +set -o nounset +set +e + +# Script-controlled variables (populated by generator in real use) +USERID=danv +SIM_ID=888888888 +TOTAL_JOBS=8 # to be set by generator to lso.getTotalNumberOfJobs() +JOB_TIMEOUT_SECONDS=300 # per-job timeout (seconds), adjust per generator + +# Truncate / delete various logs and the solver input file, to start clean +: > /share/apps/vcell3/htclogs/V_TEST2_${SIM_ID}_0_.slurm.log + +echo "=== Singularity check BEFORE module load ===" +if command -v singularity >/dev/null 2>&1; then + echo "Singularity found at: $(command -v singularity)" + singularity --version +else + echo "Singularity not found before module load" +fi + +TMPDIR=/scratch/vcell +if [ ! -e $TMPDIR ]; then mkdir -p $TMPDIR ; fi +echo `hostname` +export MODULEPATH=/isg/shared/modulefiles:/tgcapps/modulefiles +if [ -f /usr/share/modules/init/bash ]; then + source /usr/share/modules/init/bash + module load singularity/vcell-3.10.0 +else + echo "[Warning] Module init script not found - skipping module setup" +fi +export SINGULARITY_CACHEDIR=/share/apps/vcell3/singularity/cachdir +export SINGULARITY_PULLFOLDER=/share/apps/vcell3/singularity/pullfolder + +echo "=== Singularity check AFTER module load ===" +if command -v singularity >/dev/null 2>&1; then + echo "Singularity found at: $(command -v singularity)" + singularity --version +else + echo "Singularity not found after module load" + exit 127 +fi + +# Compute memory per task and per job +MEM_TASK=$(( SLURM_MEM_PER_CPU * SLURM_CPUS_PER_TASK )) +MEM_JOB=$(( MEM_TASK * SLURM_NTASKS )) + +echo "======= SLURM job started =======" +echo "Hostname : $(hostname -f)" +echo "User : $USERID" +echo "Sim ID : $SIM_ID" +echo "id : $(id)" +echo "Total Jobs : $TOTAL_JOBS" +echo "Job Timeout : $JOB_TIMEOUT_SECONDS" +echo "Slurm Job ID : $SLURM_JOB_ID" +echo "Slurm Job Name : $SLURM_JOB_NAME" +echo "Start Time : $(date)" +echo "Working Dir : $(pwd)" +echo "Node List : $SLURM_NODELIST" +echo "CPUs per task : $SLURM_CPUS_PER_TASK" +echo "Mem. per task : ${MEM_TASK} MB total" +echo "Mem. 
per job : ${MEM_JOB} MB total" +echo "Environment snapshot:" +env +echo "=================================" + +container_bindings="--bind /share/apps/vcell3/users:/simdata " +container_bindings+="--bind /share/apps/vcell7/users:/simdata_secondary " +container_bindings+="--bind /share/apps/vcell12/users:/share/apps/vcell12/users " +container_bindings+="--bind /share/apps/vcell3/htclogs:/htclogs " +container_bindings+="--bind /scratch/vcell:/solvertmp " + +container_env="--env java_mem_Xmx=3600M " +container_env+="--env jmshost_sim_internal=rke-wn-01.cam.uchc.edu " +container_env+="--env jmsport_sim_internal=31618 " +container_env+="--env jmsrestport_sim_internal=30163 " +container_env+="--env jmsuser=clientUser " +container_env+="--env jmspswd=dummy " +container_env+="--env jmsblob_minsize=100000 " +container_env+="--env mongodbhost_internal=rke-wn-01.cam.uchc.edu " +container_env+="--env mongodbport_internal=30019 " +container_env+="--env mongodb_database=test " +container_env+="--env primary_datadir_external=/share/apps/vcell3/users " +container_env+="--env secondary_datadir_external=/share/apps/vcell7/users " +container_env+="--env htclogdir_external=/share/apps/vcell3/htclogs " +container_env+="--env softwareVersion=Rel_Version_7.7.0_build_34 " +container_env+="--env serverid=TEST2 " + +# Container image path +sif_path="/share/apps/vcell3/singularity/cachdir/sif/vcell-batch-7.7.0.34.sif" +# Full solver command +batch_container_prefix="singularity run --containall ${container_bindings} ${container_env} ${sif_path}" +solver_container_prefix="singularity run --containall ${container_bindings} ${container_env} ${sif_path}" +slurm_prefix="srun -N1 -n1 -c${SLURM_CPUS_PER_TASK}" + +echo "Job Execution Log" +echo "------------------" +echo "=== [$(date)] === SLURM job ${SLURM_JOB_ID} started on $(hostname)" + +INPUT_DIR="/simdata/${USERID}" +CONFIG_FILE="/share/apps/vcell3/users/${USERID}/SimID_${SIM_ID}_0_.langevinMessagingConfig" + +echo "Parse config file once" +broker_host=""; broker_port=""; broker_username=""; broker_password="" +vc_username=""; simKey=""; taskID=""; jobIndex="" +while IFS="=" read -r key value; do + case "$key" in + broker_host) broker_host="$value" ;; + broker_port) broker_port="$value" ;; + broker_username) broker_username="$value" ;; + broker_password) broker_password="$value" ;; + vc_username) vc_username="$value" ;; + simKey) simKey="$value" ;; + taskID) taskID="$value" ;; + jobIndex) jobIndex="$value" ;; + esac +done < "$CONFIG_FILE" +echo "Parsed configuration:" +echo " broker_host = $broker_host" +echo " broker_port = $broker_port" +echo " broker_username = $broker_username" +echo " broker_password = $broker_password" +echo " vc_username = $vc_username" +echo " simKey = $simKey" +echo " taskID = $taskID" +echo " jobIndex = $jobIndex" + +statusCode=1001 +statusMsg="Running" +# Immutable part, built once +BASE_PROPERTIES="JMSDeliveryMode=persistent&JMSTimeToLive=3000" +BASE_PROPERTIES+="&SimKey=${simKey}" +BASE_PROPERTIES+="&JobIndex=${jobIndex}" +BASE_PROPERTIES+="&TaskID=${taskID}" +BASE_PROPERTIES+="&UserName=${vc_username}" +BASE_PROPERTIES+="&MessageType=WorkerEvent" +BASE_PROPERTIES+="&WorkerEvent_Status=${statusCode}" +BASE_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" +BASE_PROPERTIES+="&HostName=$(hostname)" + +# -------------------------------------------------------------------------------- + +# Build only the execution command (after each task in the production code) +timepoint=$(date +%s) +progress=0.5 
+PROPERTIES="${BASE_PROPERTIES}&WorkerEvent_TimePoint=${timepoint}&WorkerEvent_Progress=${progress}" + +msgCommand="set -o errexit; set -o pipefail; set -o nounset" +msgCommand+=" +curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" +${solver_container_prefix} /bin/bash -c "$msgCommand" # Execute inside singularity + +echo "=== [$(date)] === MESSAGING job completed successfully" +exit 0 \ No newline at end of file diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/V_TEST2_999999999_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_TEST2_999999999_0_0.slurm.sub new file mode 100644 index 0000000000..c7f3810ad5 --- /dev/null +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_TEST2_999999999_0_0.slurm.sub @@ -0,0 +1,345 @@ +#!/usr/bin/bash +#SBATCH --partition=vcell +#SBATCH --reservation= +#SBATCH --qos=vcell +#SBATCH -J V_TEST2_999999999_0_0 +#SBATCH -o /share/apps/vcell3/htclogs/V_TEST2_999999999_0_.slurm.log +#SBATCH -e /share/apps/vcell3/htclogs/V_TEST2_999999999_0_.slurm.log +#SBATCH --ntasks=3 # number of concurrent tasks +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=4096M +#SBATCH --nodes=1 +#SBATCH --time=52:49:00 # timeout for the entire job +#SBATCH --no-kill +#SBATCH --no-requeue + +set -o errexit +set -o pipefail +set -o nounset +set +e + +# Script-controlled variables (populated by generator in real use) +USERID=danv +SIM_ID=999999999 +TOTAL_JOBS=8 # to be set by generator to lso.getTotalNumberOfJobs() +JOB_TIMEOUT_SECONDS=28800 # per-job timeout (seconds), adjust per generator +LOG_FILE="/share/apps/vcell3/htclogs/V_TEST2_999999999_0_.submit.log" +MESSAGING_CONFIG_FILE="/share/apps/vcell3/users/danv/SimID_999999999_0_.langevinMessagingConfig" + +# Truncate / delete various logs and the solver input file, to start clean +: > /share/apps/vcell3/htclogs/V_TEST2_${SIM_ID}_0_.slurm.log +rm -f /share/apps/vcell3/users/${USERID}/SimID_${SIM_ID}_0_*.log +rm -f /share/apps/vcell3/users/${USERID}/SimID_${SIM_ID}_0__*.ida +rm -f /share/apps/vcell3/users/${USERID}/SimID_${SIM_ID}_0__*.json +rm -f /share/apps/vcell3/users/${USERID}/SimID_${SIM_ID}_0_.functions +rm -f /share/apps/vcell3/users/${USERID}/SimID_${SIM_ID}_0_.langevinInput +rm -f /share/apps/vcell3/users/${USERID}/SimID_${SIM_ID}_0_.langevinMessagingConfig + +echo "=== Singularity check BEFORE module load ===" +if command -v singularity >/dev/null 2>&1; then + echo "Singularity found at: $(command -v singularity)" + singularity --version +else + echo "Singularity not found before module load" +fi + +TMPDIR=/scratch/vcell +if [ ! 
-e $TMPDIR ]; then mkdir -p $TMPDIR ; fi +echo `hostname` +export MODULEPATH=/isg/shared/modulefiles:/tgcapps/modulefiles +if [ -f /usr/share/modules/init/bash ]; then + source /usr/share/modules/init/bash + module load singularity/vcell-3.10.0 +else + echo "[Warning] Module init script not found - skipping module setup" +fi +export SINGULARITY_CACHEDIR=/share/apps/vcell3/singularity/cachdir +export SINGULARITY_PULLFOLDER=/share/apps/vcell3/singularity/pullfolder + +echo "=== Singularity check AFTER module load ===" +if command -v singularity >/dev/null 2>&1; then + echo "Singularity found at: $(command -v singularity)" + singularity --version +else + echo "Singularity not found after module load" + exit 127 +fi + +# Compute memory per task and per job +MEM_TASK=$(( SLURM_MEM_PER_CPU * SLURM_CPUS_PER_TASK )) +MEM_JOB=$(( MEM_TASK * SLURM_NTASKS )) + +echo "======= SLURM job started =======" +echo "Hostname : $(hostname -f)" +echo "User : $USERID" +echo "Sim ID : $SIM_ID" +echo "id : $(id)" +echo "Total Jobs : $TOTAL_JOBS" +echo "Job Timeout : $JOB_TIMEOUT_SECONDS" +echo "Slurm Job ID : $SLURM_JOB_ID" +echo "Slurm Job Name : $SLURM_JOB_NAME" +echo "Start Time : $(date)" +echo "Working Dir : $(pwd)" +echo "Node List : $SLURM_NODELIST" +echo "CPUs per task : $SLURM_CPUS_PER_TASK" +echo "Mem. per task : ${MEM_TASK} MB total" +echo "Mem. per job : ${MEM_JOB} MB total" +echo "Environment snapshot:" +env +echo "=================================" + +container_bindings="--bind /share/apps/vcell3/users:/simdata " +container_bindings+="--bind /share/apps/vcell7/users:/simdata_secondary " +container_bindings+="--bind /share/apps/vcell12/users:/share/apps/vcell12/users " +container_bindings+="--bind /share/apps/vcell3/htclogs:/htclogs " +container_bindings+="--bind /scratch/vcell:/solvertmp " + +container_env="--env java_mem_Xmx=3600M " +container_env+="--env jmshost_sim_internal=k8s-wn-01.cam.uchc.edu " +container_env+="--env jmsport_sim_internal=31618 " +container_env+="--env jmsrestport_sim_internal=30163 " +container_env+="--env jmsuser=clientUser " +container_env+="--env jmspswd=dummy " +container_env+="--env jmsblob_minsize=100000 " +container_env+="--env mongodbhost_internal=rke-wn-01.cam.uchc.edu " +container_env+="--env mongodbport_internal=30019 " +container_env+="--env mongodb_database=test " +container_env+="--env primary_datadir_external=/share/apps/vcell3/users " +container_env+="--env secondary_datadir_external=/share/apps/vcell7/users " +container_env+="--env htclogdir_external=/share/apps/vcell3/htclogs " +container_env+="--env softwareVersion=Rel_Version_7.7.0_build_39 " +container_env+="--env serverid=TEST2 " + +# Full solver command +solver_docker_name=ghcr.io/virtualcell/vcell-batch:7.7.0.39 +solver_container_prefix="singularity run --containall ${container_bindings} ${container_env} docker://${solver_docker_name}" +batch_docker_name=ghcr.io/virtualcell/vcell-batch:7.7.0.39 +batch_container_prefix="singularity run --containall ${container_bindings} ${container_env} docker://${batch_docker_name}" +slurm_prefix="srun -N1 -n1 -c${SLURM_CPUS_PER_TASK}" + +# === From here down, the code is identical for all jobs and should be copied from a resource file === + +echo "Job Execution Log" > "${LOG_FILE}" +echo "------------------" >> "${LOG_FILE}" +echo "=== [$(date)] === SLURM job ${SLURM_JOB_ID} started on $(hostname)" | tee -a "${LOG_FILE}" + +SIMXML="/simdata/${USERID}/SimID_${SIM_ID}_0__0.simtask.xml" # preprocessor XML path (per-user) +echo "[Preprocess] Running JavaPreprocessor64 
${SIMXML}" | tee -a "${LOG_FILE}" + +# run JavaPreprocessor64 inside batch container (use generated batch prefix) +${batch_container_prefix} JavaPreprocessor64 "${SIMXML}" "/simdata/${USERID}" +stat=$? +echo "JavaPreprocessor64 returned ${stat}" | tee -a "${LOG_FILE}" +if [ $stat -ne 0 ]; then + ${batch_container_prefix} JavaPostprocessor64 "${SIM_ID}" "${USERID}" 17 0 0 $stat "/htclogs/${SLURM_JOB_NAME}.slurm.sub" || true + echo "Preprocessor failed; exiting ${stat}" | tee -a "${LOG_FILE}" + exit $stat +fi + +# ---------------------------------------------------------------------------------------------- + +echo "Parse the messaging config file" +broker_host=""; broker_port=""; broker_username=""; broker_password="" +vc_username=""; simKey=""; taskID=""; jobIndex="" +while IFS="=" read -r key value; do + case "$key" in + broker_host) broker_host="$value" ;; + broker_port) broker_port="$value" ;; + broker_username) broker_username="$value" ;; + broker_password) broker_password="$value" ;; + vc_username) vc_username="$value" ;; + simKey) simKey="$value" ;; + taskID) taskID="$value" ;; + jobIndex) jobIndex="$value" ;; + esac +done < "$MESSAGING_CONFIG_FILE" + +echo "Parsed configuration:" +echo " broker_host = $broker_host" +echo " broker_port = $broker_port" +echo " broker_username = $broker_username" +echo " broker_password = $broker_password" +echo " vc_username = $vc_username" +echo " simKey = $simKey" +echo " taskID = $taskID" +echo " jobIndex = $jobIndex" + +statusCode=1001 +statusMsg="Running" +# Immutable part, built once +BASE_PROPERTIES="JMSDeliveryMode=persistent&JMSTimeToLive=3000" +BASE_PROPERTIES+="&SimKey=${simKey}" +BASE_PROPERTIES+="&JobIndex=${jobIndex}" +BASE_PROPERTIES+="&TaskID=${taskID}" +BASE_PROPERTIES+="&UserName=${vc_username}" +BASE_PROPERTIES+="&MessageType=WorkerEvent" +BASE_PROPERTIES+="&HostName=$(hostname)" + +INPUT_DIR="/simdata/${USERID}" # on singularity, each solver instance reads inputs from here +LOG_DIR="/simdata/${USERID}" # on singularity, each solver instance writes logs here +declare -A job_pid_map +job_pids=() +running_jobs=0 +any_fail=0 +max_concurrent_jobs=${SLURM_NTASKS:-4} # actually these are tasks +echo "max_concurrent_jobs: ${max_concurrent_jobs}" >> "${LOG_FILE}" +echo "TOTAL_JOBS to launch: ${TOTAL_JOBS}" >> "${LOG_FILE}" + +finished_jobs=0 # messaging counters +last_notify_time=0 +MIN_NOTIFY_INTERVAL=10 # seconds + +for ((i = 0; i < TOTAL_JOBS; i++)); do + echo "Task $i starting at $(date)" >> "${LOG_FILE}" # log task start + + ( # run each task in parallel + timeout "${JOB_TIMEOUT_SECONDS}s" \ + ${slurm_prefix} ${solver_container_prefix} \ + langevin_x64 simulate \ + --output-log="${LOG_DIR}/SimID_${SIM_ID}_0_${i}.log" \ + "${INPUT_DIR}/SimID_${SIM_ID}_0_.langevinInput" \ + "$i" \ + -tid 0 + ) & + pid=$! # capture the task PID + echo "PID is $pid" >> "${LOG_FILE}" + job_pids+=($pid) # store the PID + job_pid_map[$pid]=$i # map task index to PID + ((running_jobs++)) # increment running task count + echo "currently running jobs: ${running_jobs}" >> "${LOG_FILE}" + + # wait for a finished task before launching a new one if we hit the concurrency limit + while (( running_jobs >= max_concurrent_jobs )); do + for idx in "${!job_pids[@]}"; do + pid="${job_pids[$idx]}" + if [[ -z "${pid}" ]]; then continue; fi + if ! kill -0 "$pid" 2>/dev/null; then # check if process is still running + wait "$pid" # + exit_code=$? 
+ job_index=${job_pid_map[$pid]:-unknown} # retrieve original task index + echo "Task $job_index with pid ${pid} finished with exit code $exit_code at $(date)" >> "${LOG_FILE}" + unset "job_pids[$idx]" # remove PID from active PID list + unset "job_pid_map[$pid]" # remove mapping + ((running_jobs--)) + ((finished_jobs++)) + progress=$(awk "BEGIN {print ${finished_jobs}/(${TOTAL_JOBS}+1)}") # compute progress + timepoint=$(date +%s) + if (( timepoint - last_notify_time >= MIN_NOTIFY_INTERVAL )); then + last_notify_time=$timepoint + statusCode=1001 + statusMsg="Running" + RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" + RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" + RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" + RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" + PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" + msgCommand="set -o errexit; set -o pipefail; set -o nounset" + msgCommand+=" + curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" + ${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity + echo "progress notification sent, ${progress} done" >> "${LOG_FILE}" + fi + if [[ $exit_code -ne 0 ]]; then any_fail=1; fi + break # exit for-loop once we free up a slot + fi + done + sleep 1 # allow brief pause before rechecking + done +done + +# Final wait for remaining tasks +for pid in "${job_pids[@]}"; do + if [[ -z "${pid}" ]]; then continue; fi + wait "$pid" + exit_code=$? + job_index=${job_pid_map[$pid]:-unknown} + echo "Task $job_index finished with exit code $exit_code at $(date)" >> "${LOG_FILE}" + ((finished_jobs++)) + progress=$(awk "BEGIN {print ${finished_jobs}/(${TOTAL_JOBS}+1)}") # compute progress + timepoint=$(date +%s) + if (( timepoint - last_notify_time >= MIN_NOTIFY_INTERVAL )); then + last_notify_time=$timepoint + statusCode=1001 + statusMsg="Running" + RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" + RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" + RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" + RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" + PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" + msgCommand="set -o errexit; set -o pipefail; set -o nounset" + msgCommand+=" + curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" + ${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity + echo "progress notification sent, ${progress} done" >> "${LOG_FILE}" + fi + if [[ $exit_code -ne 0 ]]; then any_fail=1; fi +done +echo "Batch job completed at $(date)" >> "${LOG_FILE}" + +# ---------------------------------------------------------------------------------------------- +# postprocess solver invocation (runs after all simulations finish) +echo "Starting the last job (postprocess) at $(date)" >> "${LOG_FILE}" +timeout "${JOB_TIMEOUT_SECONDS}s" \ + ${slurm_prefix} ${solver_container_prefix} \ + langevin_x64 postprocess \ + "${INPUT_DIR}/SimID_${SIM_ID}_0_.langevinInput" \ + ${TOTAL_JOBS} \ + --output-log="${LOG_DIR}/SimID_${SIM_ID}_0_P.log" \ + --vc-print-status & + +last_pid=$! +wait $last_pid +exit_code=$? 
+echo "Task 'Last' with PID $last_pid finished with exit code $exit_code at $(date)" >> "${LOG_FILE}" +timepoint=$(date +%s) +progress="1.0" +statusCode=1001 +statusMsg="Running" +RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" +RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" +RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" +RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" +PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" +msgCommand="set -o errexit; set -o pipefail; set -o nounset" +msgCommand+=" +curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" +${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity +echo "progress notification sent, ${progress} done" >> "${LOG_FILE}" +statusCode=1003 +statusMsg="Finished" +RUNTIME_PROPERTIES="&WorkerEvent_Status=${statusCode}" +RUNTIME_PROPERTIES+="&WorkerEvent_StatusMsg=${statusMsg}" +RUNTIME_PROPERTIES+="&WorkerEvent_TimePoint=${timepoint}" +RUNTIME_PROPERTIES+="&WorkerEvent_Progress=${progress}" +PROPERTIES="${BASE_PROPERTIES}${RUNTIME_PROPERTIES}" +msgCommand="set -o errexit; set -o pipefail; set -o nounset" +msgCommand+=" +curl -v -XPOST \"http://${broker_username}:${broker_password}@${broker_host}:${broker_port}/api/message/workerEvent?type=queue&${PROPERTIES}\"" +${solver_container_prefix} /bin/bash -c "$msgCommand" # execute inside singularity +echo "FINISHED notification sent" >> "${LOG_FILE}" +echo "The final task finished at $(date)" >> "${LOG_FILE}" +echo "All tasks completed at $(date)" >> "${LOG_FILE}" + +# ---------------------------------------------------------------------------------------------- +# run JavaPostprocessor64 inside batch container +echo "[Postprocess] Running JavaPostprocessor64..." | tee -a "${LOG_FILE}" +timeout 20s ${batch_container_prefix} JavaPostprocessor64 "${SIM_ID}" "${USERID}" 17 0 0 "${TOTAL_JOBS}" "/htclogs/${SLURM_JOB_NAME}.slurm.sub" +post_exit=$? +set -e + +# final exit decision +if [[ "${any_fail}" -ne 0 ]]; then + echo "One or more simulation tasks failed; exiting non-zero" | tee -a "${LOG_FILE}" + exit 1 +fi + +if [[ "${post_exit}" -eq 124 ]]; then + echo "JavaPostprocessor64 timed out after 20 seconds; exiting with code 124" | tee -a "${LOG_FILE}" + exit 124 +elif [[ "${post_exit}" -ne 0 ]]; then + echo "JavaPostprocessor64 failed with exit code ${post_exit}; exiting ${post_exit}" | tee -a "${LOG_FILE}" + exit "${post_exit}" +fi + +echo "=== [$(date)] === SLURM job completed successfully" | tee -a "${LOG_FILE}" +exit 0 \ No newline at end of file diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/dynamic.langevinInput b/vcell-server/src/test/resources/slurm_fixtures/langevin/dynamic.langevinInput similarity index 100% rename from vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/dynamic.langevinInput rename to vcell-server/src/test/resources/slurm_fixtures/langevin/dynamic.langevinInput diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/populate_vcell_batch.sh b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/populate_vcell_batch.sh new file mode 100644 index 0000000000..8e16f88761 --- /dev/null +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/populate_vcell_batch.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +echo "Cleaning up old subfolders inside vcell-batch-job..." 
+
+# Explicitly remove individual folders
+rm -rf ~/vcell-batch-job/input
+rm -rf ~/vcell-batch-job/logs
+rm -rf ~/vcell-batch-job/output
+rm -rf ~/vcell-batch-job/scripts
+
+# Recreate each directory separately
+echo "Recreating folders: input/, logs/, output/, scripts/"
+mkdir -p ~/vcell-batch-job/input
+mkdir -p ~/vcell-batch-job/logs
+mkdir -p ~/vcell-batch-job/output
+mkdir -p ~/vcell-batch-job/scripts
+
+echo "Project folders populated successfully."
diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/readme.md b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/readme.md
new file mode 100644
index 0000000000..bc5ac1e07c
--- /dev/null
+++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/readme.md
@@ -0,0 +1,219 @@
+# vcell-batch-job
+
+
+This project supports the execution of Virtual Cell (VCell) simulations inside a containerized environment using Apptainer (formerly Singularity) on a Linux system hosted via WSL on Windows 10. This setup does not support slurm.
+
+---
+
+## 🖥️ Environment Setup
+
+### 1. Install WSL (Windows Subsystem for Linux)
+
+1. Open **PowerShell as Administrator**.
+2. Run:
+ ```powershell
+ wsl --install -d Ubuntu
+ ```
+ If you encounter issues:
+ ```powershell
+ dism.exe /online /enable-feature /featurename:Microsoft-Windows-Subsystem-Linux /all /norestart
+ dism.exe /online /enable-feature /featurename:VirtualMachinePlatform /all /norestart
+ wsl --update
+ wsl --install -d Ubuntu
+ ```
+3. Restart Windows when prompted
+
+### 2. Launch Ubuntu
+
+ Open the Ubuntu app from the Windows Start Menu.
+
+ You’ll land in a Linux terminal prompt:
+ ```text
+ vasilescu@oci:~$
+ ```
+
+To maintain visibility between File Explorer and Ubuntu's /home/vasilescu, paste \ this in the File Explorer address bar:
+```text
+\\wsl$\Ubuntu\home\vasilescu\vcell-batch-job
+```
+
+
+### 3. Installing Apptainer
+
+1. Update packages and install dependencies:
+```bash
+sudo apt update && sudo apt install -y \
+build-essential wget git cryptsetup runc uidmap squashfs-tools \
+fuse-overlayfs libseccomp-dev pkg-config libglib2.0-dev libfuse2
+```
+2. Download and install Apptainer:
+```bash
+export VERSION=1.3.0
+wget https://github.com/apptainer/apptainer/releases/download/v${VERSION}/apptainer_${VERSION}_amd64.deb
+sudo apt install ./apptainer_${VERSION}_amd64.deb
+```
+3. Verify the installation:
+ ```bash
+ apptainer version
+ ```
+
+### 4. Project directory structure
+
+```text
+vcell-batch-job/
+├── archive/ # Archived or versioned Slurm scripts
+├── container/ # Apptainer images (e.g., vcell-batch-7.7.0.27.sif)
+├── input/ # Input files (e.g., .simtask.xml, .langevinInput)
+├── logs/ # Logs generated by Slurm .slurm.sub scripts
+├── output/ # Simulation outputs (e.g., solver logs)
+├── scripts/ # Active Slurm submission scripts and helpers
+```
+To scaffold this layout, run:
+```bash
+mkdir -p vcell-batch-job/{archive,container,input,logs,output,scripts}
+cp vcell-batch_*.sif vcell-batch-job/container/
+```
+
+### 5. Container pulling and versioning
+
+I need to run apptainer from within the Ubuntu shell on the local machine
+since it is not available on mantis. \
+Remember that the special file path is different from the one on Windows; paste
+the following in the File Explorer address bar:
+> \\wsl$\Ubuntu\home\vasilescu\vcell-batch-job
+
+Set environment variables for GitHub Container Registry access:
+```bash
+export APPTAINER_DOCKER_USERNAME=
+export APPTAINER_DOCKER_PASSWORD=
+```
+You'll need a ghcr login token
+```text
+On my machine, they are stored in
+Z:/.ssh/ghcr_login_token
+```
+
+Pull the desired container image:
+```bash
+apptainer pull vcell-batch-7.7.0.34.sif docker://ghcr.io/virtualcell/vcell-batch:7.7.0.34
+```
+
+> All container images are saved under **container/** using versioned filenames \
+> (e.g., vcell-batch-7.7.0.27.sif, vcell-batch-7.7.0.28.sif)
+
+
+### 6. On an HPC node - using slurm
+Log in to an HPC node:
+```bash
+ssh vasilescu@login.hpc.cam.uchc.edu
+ssh vasilescu@mantis-040
+```
+
+We assume that everything is installed properly, as described above. \
+Root directory:
+> Z: ⇔ /home/FCAM/vasilescu
+```bash
+BASE_DIR=$HOME/vcell-batch-job
+mkdir -p $BASE_DIR/{input,output,logs,scripts,container}
+```
+
+Frequently used commands, from BASE_DIR
+```bash
+sed -i 's/\r$//' ./scripts/submit_vcell_batch.slurm.sub
+tail -f ./logs/submit_vcell_batch.log
+singularity exec ./container/vcell-batch-7.7.0.28.sif java --version # java version
+singularity shell ./container/vcell-batch-7.7.0.28.sif # enter singularity
+```
+#### Initialize directory structure
+```bash
+./populate_vcell_batch.sh
+```
+
+#### Running the solver inside the container
+```bash
+singularity exec ./container/vcell-batch-7.7.0.34.sif langevin_x64 --version
+```
+
+#### Launching a slurm script
+```bash
+sbatch ./scripts/submit_vcell_batch.slurm.sub
+```
+
+GitHub repository - configuration
+```text
+https://github.com/virtualcell/vcell-fluxcd/
+https://github.com/virtualcell/vcell-fluxcd/blob/main/kustomize/config/prod/
+submit.env # all definitions for slurm, docker, dirs
+```
+Logs - on Linux machines, e.g. mantis-040
+```text
+/share/apps/vcell3/htclogs
+
+> less V_REL_292453752_0_0.slurm.sub # shows top of the script (the definitions) \
+> ls -ot V_REL* | head # shows a few of the most recent release sims
+```
+
+### 7. Data to use for development (in the TEMP/slurm-test-inputs directory)
+
+#### KEEP subdir
+Complex springsalad model with a transition and a binding reaction \
+Duration about 10 minutes \
+Results of 8 manual runs, slightly different results due to stochasticity \
+Should be used to test the advanced statistics logic in the solver
+
+#### proof-of-concept subdir
+Slurm script to run X trials, Y at a time
+
+#### simdata-keep subdir
+Very fast springsalad model with one binding reaction \
+
+Its content is replicated in these repository files:
+> SimID_999999999_0_.langevinInput \
+> SimID_999999999_0__0.simtask.xml
+
+Here we also have the full results of one run.
+
+##### Important note about the XML file
+The variables of interest for multiple runs under slurm, in containers, are:
+```xml
+
+ ....
+ 8
+ 20
+
+```
+
+### 8. The real deal (running in the production environment)
+
+#### Important directories on mantis-040
+```text
+/share/apps/vcell3/htclogs/ - slurm script and logs
+/share/apps/vcell3/users/danv - user's home directory, xml input file, results
+```
+
+#### Useful commands
+```bash
+ssh vcell@mantis-040.cam.uchc.edu # login to mantis as vcell user
+sed -i 's/\r$//' ./<filename> # fix line endings (Windows -> Unix style)
+ls -lt | head -20 # list most recent files
+cat <filename> # view file content
+tail -f /share/apps/vcell3/htclogs/<logfile> # monitor log file
+
+sbatch <jobname>.slurm.sub # submit a job
+squeue -j <jobid> # check job status
+squeue -u <username> # check job queue
+scancel <jobid> # cancel a job
+sacct -j <jobid> # check job accounting info
+tail -f /share/apps/vcell3/htclogs/<logfile> # monitor log file
+
+```
+
+#### Frequently used for testing
+```bash
+ssh vcell@mantis-040.cam.uchc.edu
+sed -i 's/\r$//' ./V_TEST2_999999999_0_0.slurm.sub
+sbatch ./V_TEST2_999999999_0_0.slurm.sub
+sacct -j 974291 --format=JobID,State,ExitCode,Elapsed,MaxRSS,ReqMem,NodeList
+
+```
\ No newline at end of file
diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/submit_vcell_batch.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/submit_vcell_batch.slurm.sub
new file mode 100644
index 0000000000..73c1b3276c
--- /dev/null
+++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/submit_vcell_batch.slurm.sub
@@ -0,0 +1,128 @@
+#!/bin/bash
+#SBATCH --job-name=SimID_35189106_0_
+#SBATCH --nodes=1
+#SBATCH --output=/home/FCAM/vasilescu/vcell-batch-job/logs/submit_vcell_batch.log
+##SBATCH -o %x.stdout # alternatively, uncomment and use these 2 instead of --output above
+##SBATCH -e %x.stderr
+#SBATCH --ntasks=4 # how many tasks to run in parallel
+#SBATCH --cpus-per-task=1
+#SBATCH --mem-per-cpu=8G # ntasks x cpus-per-task = 4 x 1 --> 4 x 8G = 32G
+#SBATCH --time=01:00:00 # 1 hour timeout for the entire job, format D-HH:MM:SS
+#SBATCH --partition=vcell
+#SBATCH --qos=vcell
+
+# slurm built-in placeholders
+# %j job ID
+# %x job name
+# %u user name
+
+# shell variables
+BASE_DIR=/home/FCAM/vasilescu/vcell-batch-job
+CONTAINER_IMAGE=$BASE_DIR/container/vcell-batch-7.7.0.34.sif
+INPUT_DIR=$BASE_DIR/input
+OUTPUT_DIR=$BASE_DIR/output
+LOG_DIR=$BASE_DIR/logs
+SIMXML="${INPUT_DIR}/${SLURM_JOB_NAME}_0.simtask.xml"
+
+log_file="${LOG_DIR}/${SLURM_JOB_NAME}_sub.log" # slurm script log
+total_jobs=10 # total number of jobs to run
+timeout_duration=300s # maximum allowed runtime for each job (5m may work too)
+max_concurrent_jobs=$SLURM_NTASKS # number of jobs allowed at once
+
+# Clear the log file at the start
+echo "Job Execution Log" > "${log_file}"
+echo "------------------" >> "${log_file}"
+echo "=== [$(date)] === SLURM job started on node $(hostname)"
+echo "=== [$(date)] === SLURM job started on node $(hostname)" >> "${log_file}"
+sleep 2
+
+echo "[Preprocess] Running JavaPreprocessor64..."
+echo "[Preprocess] Running JavaPreprocessor64..."
+singularity exec --cleanenv \
+    --bind $INPUT_DIR:$INPUT_DIR \
+    --bind $OUTPUT_DIR:$OUTPUT_DIR \
+    $CONTAINER_IMAGE \
+    JavaPreprocessor64 $SIMXML $OUTPUT_DIR
+
+# ---------------------------------------------------------------
+
+declare -A job_pid_map   # associative array to store job index -> PID mapping
+job_pids=()              # list to track PIDs
+running_jobs=0
+
+for i in $(seq 0 $((total_jobs - 1))); do
+    # log job start
+    echo "Job $i started at $(date)" >> "${log_file}"
+    # launch this trial in the background so the loop can keep submitting
+    timeout $timeout_duration srun -N 1 -n 1 -c 1 \
+        singularity exec --cleanenv \
+        --bind $INPUT_DIR:$INPUT_DIR \
+        --bind $OUTPUT_DIR:$OUTPUT_DIR \
+        $CONTAINER_IMAGE \
+        langevin_x64 simulate \
+        "${INPUT_DIR}/${SLURM_JOB_NAME}.langevinInput" \
+        $i \
+        --output-log="${LOG_DIR}/${SLURM_JOB_NAME}_${i}.log" \
+        --vc-print-status &
+    pid=$!                 # capture the job PID
+    job_pids+=($pid)       # store the PID
+    job_pid_map[$pid]=$i   # map PID to job index
+    ((running_jobs++))     # increment running job count
+
+    # wait for a finished job before launching a new one if we hit the concurrency limit
+    while (( running_jobs >= max_concurrent_jobs )); do
+        for idx in "${!job_pids[@]}"; do
+            pid="${job_pids[$idx]}"
+            if ! kill -0 "$pid" 2>/dev/null; then   # check if the process is still running
+                wait "$pid"                         # collect its exit status
+                exit_code=$?
+                job_index=${job_pid_map[$pid]}      # retrieve the original job index
+                echo "Job $job_index with PID $pid finished with exit code $exit_code at $(date)" >> "${log_file}"
+                unset "job_pids[$idx]"              # remove PID from the list
+                unset "job_pid_map[$pid]"           # remove the mapping
+                ((running_jobs--))                  # decrement the count
+                break                               # break once we free up a slot
+            fi
+        done
+        sleep 1   # brief pause before rechecking
+    done
+done
+
+# final wait for any remaining jobs
+for pid in "${job_pids[@]}"; do
+    wait $pid
+    exit_code=$?
+    job_index=${job_pid_map[$pid]}   # retrieve the original job index
+    echo "Job $job_index with PID $pid finished with exit code $exit_code at $(date)" >> "${log_file}"
+done
+echo "Batch jobs completed at $(date)" >> "${log_file}"
+
+echo "Starting the last job at $(date)" >> "${log_file}"
+timeout $timeout_duration srun -N 1 -n 1 -c 1 \
+    singularity exec --cleanenv \
+    --bind $INPUT_DIR:$INPUT_DIR \
+    --bind $OUTPUT_DIR:$OUTPUT_DIR \
+    $CONTAINER_IMAGE \
+    langevin_x64 postprocess \
+    "${INPUT_DIR}/${SLURM_JOB_NAME}.langevinInput" \
+    $total_jobs \
+    --output-log="${LOG_DIR}/${SLURM_JOB_NAME}_P.log" \
+    --vc-print-status &
+last_pid=$!
+wait $last_pid   # explicitly wait for the postprocess job to finish
+exit_code=$?     # capture the exit code of the job
+echo "Job 'Last' with PID $last_pid finished with exit code $exit_code at $(date)" >> "${log_file}"
+echo "The final job finished at $(date)" >> "${log_file}"
+
+echo "All jobs completed at $(date)" >> "${log_file}"
+
+
+echo "[Postprocess] Running JavaPostprocessor64..."
+echo "[Postprocess] Running JavaPostprocessor64..." 
>> "${log_file}" +singularity exec --cleanenv \ + --bind $OUTPUT_DIR:$OUTPUT_DIR \ + $CONTAINER_IMAGE \ + JavaPostprocessor64 35189106 vasilescu 17 0 0 10 $SLURM_JOB_SCRIPT + +echo "=== [$(date)] === SLURM job completed" +echo "=== [$(date)] === SLURM job completed" >> "${log_file}" diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/version-checker.sub b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/version-checker.sub new file mode 100644 index 0000000000..200c541075 --- /dev/null +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/slurm_array_poc/version-checker.sub @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --job-name=VREL999999999 +#SBATCH --output=/home/FCAM/vasilescu/vcell-batch-job/logs/submit_vcell_batch.log +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --mem=8G +#SBATCH --time=01:00:00 +#SBATCH --partition=vcell +#SBATCH --qos=vcell + +BASE_DIR=/home/FCAM/vasilescu/vcell-batch-job +CONTAINER_IMAGE=$BASE_DIR/container/vcell-batch_7.7.0.30.sif +INPUT_DIR=$BASE_DIR/input +OUTPUT_DIR=$BASE_DIR/output +LOG_DIR=$BASE_DIR/logs +SIMXML=$INPUT_DIR/SimID_999999999_0__0.simtask.xml + +echo "=== [$(date)] === SLURM job started on node $(hostname)" +sleep 15 + +echo "[Preprocess] Running JavaPreprocessor64..." + +java --version + +echo "=== [$(date)] === SLURM job completed"