Skip to content

Commit 078cf47

Browse files
authored
Merge branch 'main' into iceberg_10
2 parents e138bd9 + 14ce5c2 commit 078cf47

File tree

16 files changed

+139
-93
lines changed

16 files changed

+139
-93
lines changed

.github/workflows/docker_image.yml

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ jobs:
159159
runs-on: ${{ matrix.os }}
160160
strategy:
161161
matrix:
162-
os: [ ubuntu-latest ] #TODO: adding arm back
162+
os: [ ubuntu-latest, ubuntu-24.04-arm ]
163163

164164
steps:
165165
- name: Checkout repository
@@ -385,10 +385,9 @@ jobs:
385385
runs-on: ubuntu-latest
386386
strategy:
387387
matrix:
388-
digests: [ vcpkg-centos-8, vcpkg-centos-8-gcc13, vcpkg-centos-9, centos-8-jdk8, centos-8-jdk11, centos-8-jdk17, centos-9-jdk8, centos-9-jdk11, centos-9-jdk17 ]
388+
digests: [ vcpkg-centos-8, vcpkg-centos-9, centos-8-jdk8, centos-8-jdk11, centos-8-jdk17, centos-9-jdk8, centos-9-jdk11, centos-9-jdk17 ]
389389
needs:
390390
- build-vcpkg-centos-8
391-
- build-vcpkg-centos-8-gcc13
392391
- build-vcpkg-centos-9
393392
- build-centos-8
394393
- build-centos-9
@@ -397,7 +396,7 @@ jobs:
397396
uses: actions/download-artifact@v4
398397
with:
399398
path: ${{ runner.temp }}/digests
400-
pattern: digests-${{ matrix.digests }}-*
399+
pattern: digests-${{ matrix.digests }}-ubuntu*
401400
merge-multiple: true
402401

403402
- name: Login to Docker Hub
@@ -425,4 +424,45 @@ jobs:
425424
- name: Inspect image
426425
run: |
427426
docker buildx imagetools inspect ${{ env.DOCKERHUB_REPO }}:${{ steps.meta.outputs.version }}
427+
428+
merge-gcc13:
429+
if: ${{ startsWith(github.repository, 'apache/') }}
430+
runs-on: ubuntu-latest
431+
strategy:
432+
matrix:
433+
digests: [ vcpkg-centos-8-gcc13 ]
434+
needs:
435+
- build-vcpkg-centos-8-gcc13
436+
steps:
437+
- name: Download digests
438+
uses: actions/download-artifact@v4
439+
with:
440+
path: ${{ runner.temp }}/digests
441+
pattern: digests-${{ matrix.digests }}-*
442+
merge-multiple: true
428443

444+
- name: Login to Docker Hub
445+
uses: docker/login-action@v2
446+
with:
447+
username: ${{ secrets.DOCKERHUB_USER }}
448+
password: ${{ secrets.DOCKERHUB_TOKEN }}
449+
450+
- name: Set up Docker Buildx
451+
uses: docker/setup-buildx-action@v3
452+
453+
- name: Docker meta
454+
id: meta
455+
uses: docker/metadata-action@v5
456+
with:
457+
images: ${{ env.DOCKERHUB_REPO }}
458+
tags: ${{ matrix.digests }}
459+
460+
- name: Create manifest list and push
461+
working-directory: ${{ runner.temp }}/digests
462+
run: |
463+
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
464+
$(printf '${{ env.DOCKERHUB_REPO }}@sha256:%s ' *)
465+
466+
- name: Inspect image
467+
run: |
468+
docker buildx imagetools inspect ${{ env.DOCKERHUB_REPO }}:${{ steps.meta.outputs.version }}

.github/workflows/velox_backend_x86.yml

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,8 @@ jobs:
555555
fail-fast: false
556556
matrix:
557557
spark: [ "spark-3.2" ]
558-
celeborn: [ "celeborn-0.6.1", "celeborn-0.5.4", "celeborn-0.4.3"]
558+
celeborn: [ "celeborn-0.6.1", "celeborn-0.5.4"]
559+
writer: [ "sort", "hash"]
559560
runs-on: ubuntu-22.04
560561
container: apache/gluten:centos-8-jdk8
561562
steps:
@@ -577,9 +578,7 @@ jobs:
577578
- name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 with ${{ matrix.celeborn }}
578579
run: |
579580
EXTRA_PROFILE=""
580-
if [ "${{ matrix.celeborn }}" = "celeborn-0.4.3" ]; then
581-
EXTRA_PROFILE="-Pceleborn-0.4"
582-
elif [ "${{ matrix.celeborn }}" = "celeborn-0.5.4" ]; then
581+
if [ "${{ matrix.celeborn }}" = "celeborn-0.5.4" ]; then
583582
EXTRA_PROFILE="-Pceleborn-0.5"
584583
elif [ "${{ matrix.celeborn }}" = "celeborn-0.6.1" ]; then
585584
EXTRA_PROFILE="-Pceleborn-0.6"
@@ -592,23 +591,31 @@ jobs:
592591
cd /opt && mkdir -p celeborn && \
593592
tar xzf apache-${{ matrix.celeborn }}-bin.tgz -C /opt/celeborn --strip-components=1 && cd celeborn && \
594593
mv ./conf/celeborn-env.sh.template ./conf/celeborn-env.sh && \
595-
bash -c "echo -e 'CELEBORN_MASTER_MEMORY=4g\nCELEBORN_WORKER_MEMORY=4g\nCELEBORN_WORKER_OFFHEAP_MEMORY=8g' > ./conf/celeborn-env.sh" && \
596-
bash -c "echo -e 'celeborn.worker.commitFiles.threads 128\nceleborn.worker.sortPartition.threads 64' > ./conf/celeborn-defaults.conf" && \
594+
bash -c "echo -e 'CELEBORN_MASTER_MEMORY=8g\nCELEBORN_WORKER_MEMORY=8g\nCELEBORN_WORKER_OFFHEAP_MEMORY=16g' > ./conf/celeborn-env.sh" && \
595+
bash -c "echo -e 'celeborn.worker.commitFiles.threads 32\nceleborn.worker.sortPartition.threads 16' > ./conf/celeborn-defaults.conf" && \
597596
bash ./sbin/start-master.sh && bash ./sbin/start-worker.sh && \
598597
cd $GITHUB_WORKSPACE/tools/gluten-it && $MVN_CMD clean install -Pspark-3.2 -Pceleborn ${EXTRA_PROFILE} && \
599-
GLUTEN_IT_JVM_ARGS=-Xmx10G sbin/gluten-it.sh queries-compare \
600-
--local --preset=velox-with-celeborn --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=8 --iterations=1 && \
601-
GLUTEN_IT_JVM_ARGS=-Xmx10G sbin/gluten-it.sh queries-compare \
602-
--local --preset=velox-with-celeborn --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=8 --iterations=1 && \
603-
GLUTEN_IT_JVM_ARGS=-Xmx10G sbin/gluten-it.sh queries-compare \
604-
--local --preset=velox-with-celeborn --extra-conf=spark.celeborn.client.spark.shuffle.writer=sort \
605-
--extra-conf=spark.celeborn.push.sortMemory.threshold=8m --benchmark-type=ds --error-on-memleak \
606-
--off-heap-size=10g -s=1.0 --threads=8 --iterations=1
607-
GLUTEN_IT_JVM_ARGS=-Xmx10G sbin/gluten-it.sh queries-compare \
608-
--local --preset=velox-with-celeborn --extra-conf=spark.celeborn.client.spark.shuffle.writer=sort \
609-
--extra-conf=spark.gluten.sql.columnar.shuffle.celeborn.useRssSort=false \
610-
--extra-conf=spark.celeborn.push.sortMemory.threshold=8m --benchmark-type=ds --error-on-memleak \
611-
--off-heap-size=10g -s=1.0 --threads=8 --iterations=1
598+
GLUTEN_IT_JVM_ARGS=-Xmx16G sbin/gluten-it.sh queries-compare \
599+
--extra-conf=spark.celeborn.client.spark.shuffle.writer=${{ matrix.writer }} \
600+
--extra-conf=spark.sql.shuffle.partitions=16 \
601+
--extra-conf=spark.celeborn.client.eagerlyCreateInputStream.threads=4 \
602+
--local --preset=velox-with-celeborn --benchmark-type=h --error-on-memleak --off-heap-size=16g -s=1.0 --threads=16 --iterations=1 && \
603+
GLUTEN_IT_JVM_ARGS=-Xmx16G sbin/gluten-it.sh queries-compare \
604+
--local --preset=velox-with-celeborn --extra-conf=spark.celeborn.client.spark.shuffle.writer=${{ matrix.writer }} \
605+
--extra-conf=spark.gluten.sql.columnar.shuffle.celeborn.useRssSort=true \
606+
--extra-conf=spark.sql.shuffle.partitions=16 \
607+
--extra-conf=spark.celeborn.client.eagerlyCreateInputStream.threads=4 \
608+
--benchmark-type=ds --error-on-memleak \
609+
--off-heap-size=16g -s=1.0 --threads=16 --iterations=1
610+
if [ "${{ matrix.writer }}" = "sort" ]; then
611+
GLUTEN_IT_JVM_ARGS=-Xmx16G sbin/gluten-it.sh queries-compare \
612+
--local --preset=velox-with-celeborn --extra-conf=spark.celeborn.client.spark.shuffle.writer=${{ matrix.writer }} \
613+
--extra-conf=spark.gluten.sql.columnar.shuffle.celeborn.useRssSort=false \
614+
--extra-conf=spark.celeborn.client.eagerlyCreateInputStream.threads=4 \
615+
--extra-conf=spark.sql.shuffle.partitions=16 \
616+
--benchmark-type=ds --error-on-memleak \
617+
--off-heap-size=16g -s=1.0 --threads=16 --iterations=1
618+
fi
612619
613620
spark-test-spark32:
614621
needs: build-native-lib-centos-7

cpp/velox/jni/VeloxJniWrapper.cc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -559,8 +559,13 @@ Java_org_apache_gluten_datasource_VeloxDataSourceJniWrapper_splitBlockByPartitio
559559
const auto inputRowVector = veloxBatch->getRowVector();
560560
const auto numRows = inputRowVector->size();
561561

562-
connector::hive::PartitionIdGenerator idGen{
563-
asRowType(inputRowVector->type()), partitionColIndicesVec, 128, pool.get(), true};
562+
connector::hive::PartitionIdGenerator idGen(
563+
asRowType(inputRowVector->type()), partitionColIndicesVec, 128, pool.get()
564+
#ifdef GLUTEN_ENABLE_ENHANCED_FEATURES
565+
,
566+
true
567+
#endif
568+
);
564569
raw_vector<uint64_t> partitionIds{};
565570
idGen.run(inputRowVector, partitionIds);
566571
GLUTEN_CHECK(partitionIds.size() == numRows, "Mismatched number of partition ids");

dev/docker/Dockerfile.centos8-dynamic-build

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ RUN set -ex; \
4040
tar -xvf ${local_binary}; \
4141
mv apache-maven-${maven_version} /usr/lib/maven; \
4242
rm -rf ${local_binary}; \
43-
wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.4.3/apache-celeborn-0.4.3-bin.tgz -P /opt/; \
4443
wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.5.4/apache-celeborn-0.5.4-bin.tgz -P /opt/; \
4544
wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.6.1/apache-celeborn-0.6.1-bin.tgz -P /opt/; \
4645
wget -nv https://archive.apache.org/dist/incubator/uniffle/0.9.2/apache-uniffle-0.9.2-incubating-bin.tar.gz -P /opt/; \

dev/docker/Dockerfile.centos8-gcc13

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,19 @@
1616
# copied from https://github.com/jeromerobert/centos7-gcc13
1717
FROM centos:8 as base0
1818

19-
FROM base0 as base
19+
FROM base0 as base1
2020
RUN /usr/bin/sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-*; sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-*;
2121
RUN yum update -y && yum install epel-release wget perl -y
22-
RUN yum -y install ftp://ftp.icm.edu.pl/vol/rzm5/linux-centos-vault/8.0.1905/PowerTools/aarch64/kickstart/Packages/perl-Unicode-EastAsianWidth-1.33-13.el8.noarch.rpm ftp://ftp.icm.edu.pl/vol/rzm5/linux-centos-vault/8.2.2004/PowerTools/x86_64/kickstart/Packages/texinfo-6.5-6.el8.x86_64.rpm
2322

23+
# texinfo
24+
FROM base1 as base
25+
RUN curl -kLO https://ftp.gnu.org/gnu/texinfo/texinfo-6.8.tar.xz
26+
RUN tar xf texinfo*.tar.*
27+
WORKDIR build-texinfo
28+
RUN yum -y install gcc make m4
29+
RUN ../texinfo*/configure
30+
RUN make -j$(nproc)
31+
RUN make install
2432

2533
# Git
2634
FROM base as git

dev/docker/Dockerfile.centos8-gcc13-static-build

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
FROM inteldpo/gluten:centos8_gcc13
16+
FROM inteldpo/gluten-ci-images:centos-8_gcc13
1717

1818

1919
ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
@@ -27,15 +27,16 @@ ENV VCPKG_BINARY_SOURCES=clear;files,${VCPKG_PATH},readwrite
2727

2828
RUN set -ex; \
2929
yum update -y && yum install -y epel-release sudo dnf && yum install -y ccache; \
30-
dnf install -y --setopt=install_weak_deps=False gcc-toolset-11; \
3130
echo "check_certificate = off" >> ~/.wgetrc; \
32-
yum install -y java-1.8.0-openjdk-devel patch wget git perl; \
31+
yum install -y java-1.8.0-openjdk-devel patch wget git perl python3 automake libtool flex; \
32+
dnf -y --enablerepo=powertools install autoconf-archive ninja-build; \
33+
pip3 install --upgrade pip; \
34+
pip3 install cmake; \
3335
rpm -qa | grep tzdata; \
3436
dnf clean all; \
3537
git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten; \
36-
cd /opt/gluten && bash ./dev/vcpkg/setup-build-depends.sh; \
37-
yum remove gcc -y && yum clean all; \
38-
yes | cp -rf /usr/share/aclocal/* /usr/local/share/aclocal/; \
38+
cd /opt/gluten; bash .github/workflows/util/setup-helper.sh install_maven; \
39+
export PATH=/usr/lib/maven/bin:$PATH; \
3940
mkdir -p ${VCPKG_PATH}; \
4041
echo "Build arrow, then install the native libs to system paths and jar package to .m2/ directory."; \
4142
if [ "$(uname -m)" = "aarch64" ]; then \

dev/docker/Dockerfile.centos9-dynamic-build

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ RUN set -ex; \
3838
tar -xvf ${local_binary}; \
3939
mv apache-maven-${maven_version} /usr/lib/maven; \
4040
rm -rf ${local_binary}; \
41-
wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.4.3/apache-celeborn-0.4.3-bin.tgz -P /opt/; \
4241
wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.5.4/apache-celeborn-0.5.4-bin.tgz -P /opt/; \
4342
wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.6.1/apache-celeborn-0.6.1-bin.tgz -P /opt/; \
4443
wget -nv https://archive.apache.org/dist/incubator/uniffle/0.9.2/apache-uniffle-0.9.2-incubating-bin.tar.gz -P /opt/; \

dev/vcpkg/CONTRIBUTING.md

Lines changed: 6 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Please init vcpkg env first:
1313

1414
Vcpkg already maintains a lot of libraries.
1515
You can find them with the vcpkg CLI.
16-
(NOTE: Please always use cli because [packages on vcpkg.io](https://vcpkg.io/en/packages.html) is outdate).
16+
(NOTE: Please always use the CLI, because the [package list on vcpkg.io](https://vcpkg.io/en/packages.html) is outdated).
1717

1818
```
1919
$ ./.vcpkg/vcpkg search folly
@@ -65,37 +65,11 @@ See also [Versioning](https://learn.microsoft.com/en-us/vcpkg/users/versioning).
6565

6666
Otherwise, you must create a new port in `./ports/$package` to override vcpkg's original version.
6767

68-
**If a new ports has been merged in vcpkg main branch**.
69-
You can find git tree-ish and checkout it.
70-
For example, arrow 12.0.0 has been merged but not include in last release (2023.04.15).
71-
72-
``` patch
73-
# https://patch-diff.githubusercontent.com/raw/microsoft/vcpkg/pull/31321.patch
74-
75-
diff --git a/versions/a-/arrow.json b/versions/a-/arrow.json
76-
index 07c7ef67cb27c..f7bbe94b4f914 100644
77-
--- a/versions/a-/arrow.json
78-
+++ b/versions/a-/arrow.json
79-
@@ -1,5 +1,10 @@
80-
{
81-
"versions": [
82-
+ {
83-
+ "git-tree": "881bfaaab349dae46929b36e5b84e7036a009ad3",
84-
+ "version": "12.0.0",
85-
+ "port-version": 0
86-
+ },
87-
{
88-
"git-tree": "21fea47a1e9c7bf68e6c088ad5a6b7b6e33c2fcb",
89-
"version": "11.0.0",
90-
```
91-
92-
Git tree-ish is `21fea47a1e9c7bf68e6c088ad5a6b7b6e33c2fcb`. Then fetch and checkout it.
93-
94-
``` sh
95-
cd .vcpkg
96-
git fetch origin master
97-
git archive 21fea47a1e9c7bf68e6c088ad5a6b7b6e33c2fcb | tar -x -C ../ports/arrow
98-
```
68+
**If a newer version of a library is supported in a later version of vcpkg**.
69+
Developers can configure `vcpkg-configuration.json` to allow importing a new port, overriding the
70+
corresponding port specified in the `builtin-baseline`. This approach also applies to libraries
71+
that have been removed from vcpkg at the `builtin-baseline`; by setting a historical vcpkg version,
72+
you can import such ports.
9973

10074
**If you want to modify port based on vcpkg version**.
10175
Copy port directory from `./.vcpkg/ports/$package` to `./ports/$package`.

dev/vcpkg/README.md

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
# Build Gluten + Velox in Vcpkg Environment
22

3-
## Build in Docker
3+
## Overview
4+
5+
Currently, the `builtin-baseline` set in `vcpkg.json` is the commit hash for the `2025.09.17` tag of vcpkg.
6+
The versions of all dependency libraries are determined by their respective ports at this vcpkg version,
7+
except for those overridden in `vcpkg.json`, `vcpkg-configuration.json`, and overlay ports.
8+
9+
## Build in Docker
10+
11+
For main branch code, you can follow the commands below.
12+
- Pull the docker image: `docker pull apache/gluten:vcpkg-centos-7`
13+
- Build native code: `bash dev/ci-velox-buildstatic-centos-7.sh`
14+
- Build JVM code: `mvn clean install -Pbackends-velox -Pspark-3.5 -DskipTests`
415

5-
Please install make and docker on your system, then `make`.
616
The gluten packages will be placed in `$GLUTEN_REPO/package/target/gluten-velox-bundle-*.jar`.
717

818
## Setup build environment manually
@@ -15,7 +25,7 @@ Please install build depends on your system to compile all libraries:
1525
sudo $GLUTEN_REPO/dev/vcpkg/setup-build-depends.sh
1626
```
1727

18-
GCC-11 is the minimum required compiler. It needs to be enabled beforehand. Take Centos-7/8 as example:
28+
GCC 11 is the minimum required compiler, and it must be enabled beforehand. Take CentOS 7/8 as an example:
1929

2030
``` sh
2131
# CentOS 8
@@ -32,7 +42,7 @@ For unsupported linux distro, you can install the following packages from packag
3242
* wget
3343
* curl
3444
* git >= 2.7.4
35-
* gcc >= 9
45+
* gcc >= 11
3646
* pkg-config
3747
* autotools
3848
* flex >= 2.6.0
@@ -48,4 +58,4 @@ You can configure [binary cache](https://learn.microsoft.com/en-us/vcpkg/users/b
4858

4959
``` sh
5060
$GLUTEN_REPO/dev/buildbundle-veloxbe.sh --enable_vcpkg=ON --build_tests=ON --build_benchmarks=ON --enable_s3=ON --enable_hdfs=ON
51-
```
61+
```

dev/vcpkg/init.sh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,12 @@ SCRIPT_ROOT="$(realpath "$(dirname "$0")")"
6868
cd "$SCRIPT_ROOT"
6969

7070
if [ ! -d "$VCPKG_ROOT" ] || [ -z "$(ls "$VCPKG_ROOT")" ]; then
71-
git clone https://github.com/microsoft/vcpkg.git --branch 2023.10.19 "$VCPKG_ROOT"
71+
# The builtin-baseline (commit hash) specified in vcpkg.json should exist in this branch.
72+
# Therefore, upgrading the builtin-baseline may require updating the branch.
73+
git clone https://github.com/microsoft/vcpkg.git --branch 2025.09.17 "$VCPKG_ROOT"
7274
fi
7375
[ -f "$VCPKG" ] || "$VCPKG_ROOT/bootstrap-vcpkg.sh" -disableMetrics
7476

75-
sed -i "s/3.27.1/3.28.3/g" $VCPKG_ROOT/scripts/vcpkgTools.xml
76-
sed -i "s/192374a68e2971f04974a194645726196d9b8ee7abd650d1e6f65f7aa2ccc9b186c3edb473bb4958c764532edcdd42f4182ee1fcb86b17d78b0bcd6305ce3df1/bd311ca835ef0914952f21d70d1753564d58de2ede02e80ede96e78cd2f40b4189e006007643ebb37792e13edd97eb4a33810bc8aca1eab6dd428eaffe1d2e38/g" $VCPKG_ROOT/scripts/vcpkgTools.xml
77-
7877
EXTRA_FEATURES=""
7978
if [ "$BUILD_TESTS" = "ON" ]; then
8079
EXTRA_FEATURES+="--x-feature=duckdb "

0 commit comments

Comments
 (0)