From a780ed58874432a97de12bdb48fe9d0a13ee487e Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Fri, 16 Jan 2026 20:06:27 -0300 Subject: [PATCH 1/3] fix: removes original entrypoint and cleans up orphaned shared memory The original entrypoint (bootstrap-postgres-ha.sh) somehow doesn't exist anymore. This is probably due to an update in the base image that we missed. Funnily enough, the postgres-operator overrides the entrypoint, so we haven't seen any failures. This patch will be accompanied by an update to postgres-operator that uses the entrypoint-wrapper if a new flag file is present (required for backwards compatibility and safe rollout): /usr/local/bin/.entrypoint-wrapper-version Additionally, we add shared memory cleanup to the postgres-oom-adjuster script for when it can't find the postmaster. This is a workaround that allows us to recover from PID reuse failures caused by patroni concurrently demoting a replica. The proper fix is upgrading to Patroni >= 4.0.6. Ref.: https://github.com/patroni/patroni/pull/3372 Signed-off-by: Juliana Oliveira --- Dockerfile | 8 ++++++++ entrypoint-wrapper.sh | 26 ++++++++++++++++++-------- postgres-oom-adjuster.sh | 27 ++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 42a4cb1..dc11f9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,5 +7,13 @@ USER 0 COPY postgres-oom-adjuster.sh /usr/local/bin/ COPY entrypoint-wrapper.sh /usr/local/bin/ +# NOTE: We use this file to check if we can execute the entrypoint wrapper with arguments. +# The previous version didn't allow arguments, so if we'd try, we'd break the replica. +# We didn't catch this earlier because postgres-operator is overriding the entrypoint by +# setting a pod command param to execute patroni directly. This change will be accompanied +# by a postgres-operator patch to conditionally call the wrapper when this file exists, +# allowing us to use the wrapper and the oom adjuster. 
+RUN echo "2" > /usr/local/bin/.entrypoint-wrapper-version + # We don't change the original ENTRYPOINT - we wrap around it ENTRYPOINT ["/usr/local/bin/entrypoint-wrapper.sh"] diff --git a/entrypoint-wrapper.sh b/entrypoint-wrapper.sh index 99c6cc9..b278007 100755 --- a/entrypoint-wrapper.sh +++ b/entrypoint-wrapper.sh @@ -3,13 +3,23 @@ set -e # For Percona PostgreSQL Operator, we need to focus on protecting the postmaster process # Original user in the Percona PostgreSQL Operator image -ORIGINAL_USER=26 -ORIGINAL_ENTRYPOINT="/opt/crunchy/bin/postgres-ha/bootstrap-postgres-ha.sh" +ORIGINAL_USER=26 # (postgres user) + +# Ensure a command was provided +if [ $# -eq 0 ]; then + echo "ERROR: No command provided to entrypoint-wrapper.sh" + echo "Usage: entrypoint-wrapper.sh [args...]" + exit 1 +fi # Set OOM score adjustment for our own process (will be inherited) if [ -f "/proc/self/oom_score_adj" ]; then - echo -900 > /proc/self/oom_score_adj - echo "Set OOM score adjustment to -900 for pid 1" + if echo -900 > /proc/self/oom_score_adj 2>/dev/null; then + echo "Set OOM score adjustment to -900 for pid 1" + else + # If we fail, let the postgres oom adjuster handle it. 
+ echo "WARNING: Cannot set OOM score adjustment (will retry via background adjuster)" + fi else echo "WARNING: Cannot set OOM score adjustment (file not found)" fi @@ -22,14 +32,14 @@ echo "Started postmaster OOM adjuster in background" export PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj export PG_OOM_ADJUST_VALUE=0 -# Switching to the original user and executing original entrypoint -echo "Switching to user $ORIGINAL_USER and executing original entrypoint: $ORIGINAL_ENTRYPOINT $@" +# Switching to the original user and executing command +echo "Switching to user $ORIGINAL_USER and executing: $@" # Check which user-switching command is available if command -v runuser >/dev/null 2>&1; then # Use runuser (available on RHEL/CentOS/Fedora) - exec runuser -u "#$ORIGINAL_USER" -- "$ORIGINAL_ENTRYPOINT" "$@" + exec runuser -u "#$ORIGINAL_USER" -- "$@" else # Fall back to su - exec su -s /bin/bash $ORIGINAL_USER -c "$ORIGINAL_ENTRYPOINT $*" + exec su -s /bin/bash $ORIGINAL_USER -c "$*" fi diff --git a/postgres-oom-adjuster.sh b/postgres-oom-adjuster.sh index 63294c9..9f1cdf4 100755 --- a/postgres-oom-adjuster.sh +++ b/postgres-oom-adjuster.sh @@ -43,11 +43,32 @@ while true; do if [ "$CURRENT_SCORE" != "-900" ]; then echo -900 > /proc/$POSTMASTER_PID/oom_score_adj echo "$(date): Adjusted OOM score for postmaster PID $POSTMASTER_PID from $CURRENT_SCORE to -900" >> $LOG_FILE - else - echo "$(date): Postmaster PID $POSTMASTER_PID already has OOM score -900" >> $LOG_FILE fi else - echo "$(date): Postmaster not found yet or OOM score file not accessible" >> $LOG_FILE + # Band-aid for the Postmaster PID problem. Actual fix is in Patroni >= 4.0.6. 
+ # Ref.: https://github.com/patroni/patroni/pull/3372 + echo "$(date): Postmaster not found, checking for orphaned shared memory" >> $LOG_FILE + + # Clean up orphaned POSIX shared memory segments + # These block PostgreSQL startup if left behind after a crash + if [ -d "/dev/shm" ]; then + for shm_file in /dev/shm/PostgreSQL.*; do + if [ -f "$shm_file" ]; then + echo "$(date): Removing orphaned POSIX shared memory: $shm_file" >> $LOG_FILE + rm -f "$shm_file" 2>/dev/null + fi + done + fi + + # Clean up orphaned System V shared memory segments (nattch=0 means no attached processes) + if command -v ipcs >/dev/null 2>&1 && command -v ipcrm >/dev/null 2>&1; then + for shmid in $(ipcs -m 2>/dev/null | awk 'NR>3 && $6==0 {print $2}'); do + if [ -n "$shmid" ]; then + echo "$(date): Removing orphaned SysV shared memory segment: $shmid" >> $LOG_FILE + ipcrm -m "$shmid" 2>/dev/null + fi + done + fi fi # Sleep for a while before checking again From b8c5b47df443697860e949c5f34d34c1a1894e55 Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Fri, 16 Jan 2026 20:56:31 -0300 Subject: [PATCH 2/3] add workflow for deploying pr images so we can test them Signed-off-by: Juliana Oliveira --- .github/workflows/build-pr.yml | 60 ++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/workflows/build-pr.yml diff --git a/.github/workflows/build-pr.yml b/.github/workflows/build-pr.yml new file mode 100644 index 0000000..4ff9afc --- /dev/null +++ b/.github/workflows/build-pr.yml @@ -0,0 +1,60 @@ +name: Build PR Image + +on: + pull_request: + branches: + - main + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}/mpg-operator-postgres + +jobs: + build-pr: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + 
platforms: linux/amd64,linux/arm64 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push - Standard (PG16) + uses: docker/build-push-action@v5 + with: + context: . + build-args: | + BASE_IMAGE_TAG=2.6.0-ppg16.8-postgres + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:pr-${{ github.event.pull_request.number }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Build and push - PG17 + uses: docker/build-push-action@v5 + with: + context: . + build-args: | + BASE_IMAGE_TAG=2.6.0-ppg17.4-postgres + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:pr-${{ github.event.pull_request.number }}-pg17 + cache-from: type=gha + cache-to: type=gha,mode=max From 30cea5e87bc64d2d675e8332b1672ffb3a105f56 Mon Sep 17 00:00:00 2001 From: Juliana Oliveira Date: Mon, 19 Jan 2026 14:54:04 -0300 Subject: [PATCH 3/3] fix fks already runs as user 26 Signed-off-by: Juliana Oliveira --- Dockerfile | 9 +++++---- entrypoint-wrapper.sh | 9 +++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index dc11f9a..9be726b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,11 @@ ARG BASE_IMAGE_TAG=2.6.0-ppg16.8-postgres FROM percona/percona-postgresql-operator:${BASE_IMAGE_TAG} -# Switch to root user temporarily to gain necessary privileges for setup -USER 0 - COPY postgres-oom-adjuster.sh /usr/local/bin/ COPY entrypoint-wrapper.sh /usr/local/bin/ +# Switch user for the run command. +USER 0 # NOTE: We use this file to check if we can execute the entrypoint wrapper with arguments. # The previous version didn't allow arguments, so if we'd try, we'd break the replica. 
# We didn't catch this earlier because postgres-operator is overriding the entrypoint by @@ -15,5 +14,7 @@ COPY entrypoint-wrapper.sh /usr/local/bin/ # allowing us to use the wrapper and the oom adjuster. RUN echo "2" > /usr/local/bin/.entrypoint-wrapper-version -# We don't change the original ENTRYPOINT - we wrap around it +# Switch back to postgres user +USER 26 + ENTRYPOINT ["/usr/local/bin/entrypoint-wrapper.sh"] diff --git a/entrypoint-wrapper.sh b/entrypoint-wrapper.sh index b278007..7b28234 100755 --- a/entrypoint-wrapper.sh +++ b/entrypoint-wrapper.sh @@ -32,8 +32,13 @@ echo "Started postmaster OOM adjuster in background" export PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj export PG_OOM_ADJUST_VALUE=0 -# Switching to the original user and executing command -echo "Switching to user $ORIGINAL_USER and executing: $@" +CURRENT_UID=$(id -u) +if [ "$CURRENT_UID" = "$ORIGINAL_USER" ]; then + echo "Already running as user $ORIGINAL_USER (uid=$CURRENT_UID), executing: $@" + exec "$@" +fi + +echo "Running as uid=$CURRENT_UID, switching to user $ORIGINAL_USER and executing: $@" # Check which user-switching command is available if command -v runuser >/dev/null 2>&1; then