Restart-safety changes for the devnet start scripts.

- curio-init.sh: source .env.curio (under `set -a`) on every boot before
  `curio run`, and drop the trailing `sleep infinity`.
- start-forest.sh: launch the forest daemon through a retry loop (up to
  MAX_LAUNCH_RETRIES attempts, fixed delay) and `wait` on the daemon PID
  instead of `sleep infinity`.
- lotus-config.toml.template: listen on 0.0.0.0 and advertise a /dns4
  hostname as RemoteListenAddress.
- start-lotus-miner.sh / start-lotus.sh: detect a finished init through a
  `.init.complete` sentinel instead of config.toml, remove the repo directory
  before re-running init, regenerate config.toml on every start, and `wait`
  on the daemon PID.

diff --git a/curio/start_scripts/curio-init.sh b/curio/start_scripts/curio-init.sh
index 2e34d0a6..955d32d5 100755
--- a/curio/start_scripts/curio-init.sh
+++ b/curio/start_scripts/curio-init.sh
@@ -178,6 +178,13 @@ if [ ! -f "$CURIO_REPO_PATH/.init.pdp" ]; then
     echo "PDP service setup complete"
 fi
 
+# Source .env.curio on every boot (not just first) so curio has contract addresses after restart
+if [ -f "$CURIO_REPO_PATH/.env.curio" ]; then
+    echo "Loading contract addresses from .env.curio..."
+    set -a
+    source "${CURIO_REPO_PATH}/.env.curio"
+    set +a
+fi
+
 echo "Starting curio node..."
 CURIO_FAKE_CPU=5 curio run --nosync --name devnet --layers seal,post,pdp-only,gui
-sleep infinity
diff --git a/forest/scripts/start-forest.sh b/forest/scripts/start-forest.sh
index 6c302ba1..f8232146 100755
--- a/forest/scripts/start-forest.sh
+++ b/forest/scripts/start-forest.sh
@@ -73,11 +73,33 @@ else
     generate_forest_config
 fi
 
-forest --genesis "${SHARED_CONFIGS}/devgen.car" \
-  --config "${FOREST_DATA_DIR}/forest_config.toml" \
-  --rpc-address "${host_ip}:${FOREST_RPC_PORT}" \
-  --p2p-listen-address "/ip4/${host_ip}/tcp/${FOREST_P2P_PORT}" \
-  --healthcheck-address "${host_ip}:${FOREST_HEALTHZ_RPC_PORT}" &
+# Launch daemon with retry — on restart the network interface may not be ready yet,
+# causing bind failures. Retry up to MAX_LAUNCH_RETRIES times with a fixed 5s delay.
+launch_forest() {
+    forest --genesis "${SHARED_CONFIGS}/devgen.car" \
+        --config "${FOREST_DATA_DIR}/forest_config.toml" \
+        --rpc-address "${host_ip}:${FOREST_RPC_PORT}" \
+        --p2p-listen-address "/ip4/${host_ip}/tcp/${FOREST_P2P_PORT}" \
+        --healthcheck-address "${host_ip}:${FOREST_HEALTHZ_RPC_PORT}" &
+    FOREST_PID=$!
+}
+
+MAX_LAUNCH_RETRIES=10
+for (( attempt=1; attempt<=MAX_LAUNCH_RETRIES; attempt++ )); do
+    launch_forest
+    sleep 3
+    if kill -0 "$FOREST_PID" 2>/dev/null; then
+        echo "forest${node_number}: daemon started (pid=$FOREST_PID)"
+        break
+    fi
+    echo "forest${node_number}: daemon exited early (attempt $attempt/$MAX_LAUNCH_RETRIES), retrying in 5s..."
+    sleep 5
+done
+
+if ! kill -0 "$FOREST_PID" 2>/dev/null; then
+    echo "ERROR: forest${node_number} daemon failed to start after $MAX_LAUNCH_RETRIES attempts"
+    exit 1
+fi
 
 export TOKEN=$(cat "${FOREST_DATA_DIR}/jwt")
 export FULLNODE_API_INFO="$TOKEN:/ip4/$host_ip/tcp/${FOREST_RPC_PORT}/http"
@@ -152,4 +174,4 @@ forest-cli healthcheck healthy --healthcheck-port "${FOREST_HEALTHZ_RPC_PORT}"
 
 echo "forest${node_number}: completed startup"
 
-sleep infinity
+wait "$FOREST_PID"
diff --git a/lotus/lotus-config.toml.template b/lotus/lotus-config.toml.template
index 75ba00c5..1661d28e 100644
--- a/lotus/lotus-config.toml.template
+++ b/lotus/lotus-config.toml.template
@@ -4,11 +4,13 @@
   #
   # type: string
   # env var: LOTUS_API_LISTENADDRESS
-  ListenAddress = "/ip4/${host_ip}/tcp/${LOTUS_RPC_PORT}/http"
+  ListenAddress = "/ip4/0.0.0.0/tcp/${LOTUS_RPC_PORT}/http"
 
   # type: string
   # env var: LOTUS_API_REMOTELISTENADDRESS
-  #RemoteListenAddress = ""
+  # This is what gets written to the api file — used by miners and other clients.
+  # Must use the container hostname (DNS), not IP, so it survives IP changes after restarts.
+  RemoteListenAddress = "/dns4/${hostname}/tcp/${LOTUS_RPC_PORT}/http"
 
   # type: Duration
   # env var: LOTUS_API_TIMEOUT
diff --git a/lotus/scripts/start-lotus-miner.sh b/lotus/scripts/start-lotus-miner.sh
index e67cd9ef..b170f5ad 100755
--- a/lotus/scripts/start-lotus-miner.sh
+++ b/lotus/scripts/start-lotus-miner.sh
@@ -18,16 +18,20 @@ export CGO_CFLAGS_ALLOW="-D__BLST_PORTABLE__"
 export CGO_CFLAGS="-D__BLST_PORTABLE__"
 
 lotus-miner --version
-lotus wallet import --as-default "${SHARED_CONFIGS}/.genesis-sector-${node_number}/pre-seal-${LOTUS_MINER_ACTOR_ADDRESS}.key"
+lotus wait-api
+lotus wallet import --as-default "${SHARED_CONFIGS}/.genesis-sector-${node_number}/pre-seal-${LOTUS_MINER_ACTOR_ADDRESS}.key" || true
 
-if [ -f "${LOTUS_MINER_PATH}/config.toml" ]; then
-    echo "lotus-miner${node_number}: Repo already exists, skipping init..."
+if [ -f "${LOTUS_MINER_PATH}/.init.complete" ]; then
+    echo "lotus-miner${node_number}: Restart detected, skipping init..."
 else
+    # Clean up any partial repo state from a prior interrupted init (aborts if the path variable is unset/empty)
+    rm -rf -- "${LOTUS_MINER_PATH:?}"
     if [ "$node_number" -eq 0 ]; then
         lotus-miner init --genesis-miner --actor=${LOTUS_MINER_ACTOR_ADDRESS} --sector-size=2KiB --pre-sealed-sectors=${SHARED_CONFIGS}/.genesis-sector-${node_number} --pre-sealed-metadata=${SHARED_CONFIGS}/manifest.json --nosync
     else
         lotus-miner init --actor=${LOTUS_MINER_ACTOR_ADDRESS} --sector-size=2KiB --pre-sealed-sectors=${SHARED_CONFIGS}/.genesis-sector-${node_number} --pre-sealed-metadata=${SHARED_CONFIGS}/manifest.json --nosync
     fi
+    touch "${LOTUS_MINER_PATH}/.init.complete"
 fi
 
 echo "lotus-miner${node_number}: setup complete"
diff --git a/lotus/scripts/start-lotus.sh b/lotus/scripts/start-lotus.sh
index 16579035..a3e52311 100755
--- a/lotus/scripts/start-lotus.sh
+++ b/lotus/scripts/start-lotus.sh
@@ -20,10 +20,14 @@ export LOTUS_SKIP_GENESIS_CHECK=${LOTUS_SKIP_GENESIS_CHECK}
 export CGO_CFLAGS_ALLOW="-D__BLST_PORTABLE__"
 export CGO_CFLAGS="-D__BLST_PORTABLE__"
 
-if [ ! -f "${LOTUS_DATA_DIR}/config.toml" ]; then
-    INIT_MODE=true
-else
+# Use a dedicated sentinel instead of config.toml — the daemon creates config.toml
+# early during init, so a restart mid-genesis would falsely skip init.
+if [ -f "${LOTUS_DATA_DIR}/.init.complete" ]; then
     INIT_MODE=false
+else
+    INIT_MODE=true
+    # Clean up any partial repo state from a prior interrupted init (aborts if the path variable is unset/empty)
+    rm -rf -- "${LOTUS_PATH:?}"
 fi
 
 MAX_DRAND_RETRIES=60
@@ -48,6 +52,9 @@ while true; do
     fi
 done
 
+# Always regenerate config — Antithesis may assign a new IP after container restart
+sed "s|\${LOTUS_RPC_PORT}|$LOTUS_RPC_PORT|g; s|\${hostname}|lotus${node_number}|g" config.toml.template > config.toml
+
 if [ "$INIT_MODE" = "true" ]; then
     host_ip=$(getent hosts "lotus${node_number}" | awk '{ print $1 }')
 
@@ -55,8 +62,6 @@ if [ "$INIT_MODE" = "true" ]; then
     echo "ip address: $host_ip"
     echo "---------------------------"
 
-    sed "s|\${host_ip}|$host_ip|g; s|\${LOTUS_RPC_PORT}|$LOTUS_RPC_PORT|g" config.toml.template > config.toml
-
     if [ "$node_number" -eq 0 ]; then
         ./scripts/setup-genesis.sh
     fi
@@ -69,9 +74,12 @@ if [ "$INIT_MODE" = "true" ]; then
         lotus --repo="${LOTUS_PATH}" daemon --genesis=${SHARED_CONFIGS}/devgen.car --bootstrap=false --config=config.toml&
     fi
 else
-    lotus --repo="${LOTUS_PATH}" daemon --bootstrap=false --config=config.toml&
+    echo "lotus${node_number}: Restart detected, skipping init..."
+    lotus --repo="${LOTUS_PATH}" daemon --bootstrap=false --config=config.toml &
 fi
 
+LOTUS_PID=$!
+
 lotus --version
 lotus wait-api
 
@@ -81,6 +89,12 @@ if [ ! -f "${LOTUS_DATA_DIR}/lotus${node_number}-jwt" ]; then
     lotus auth create-token --perm admin > "${LOTUS_DATA_DIR}/lotus${node_number}-jwt"
 fi
 
+# Mark init as complete — must be after daemon is confirmed running
+if [ "$INIT_MODE" = "true" ]; then
+    touch "${LOTUS_DATA_DIR}/.init.complete"
+    echo "lotus${node_number}: Init complete, sentinel written"
+fi
+
 connect_with_retries() {
     local max_retries=10
     local addr_file="$1"
@@ -127,4 +141,4 @@ done
 
 echo "lotus${node_number}: completed startup"
 
-sleep infinity
+wait "$LOTUS_PID"