diff --git a/go.mod b/go.mod index 45752954788..c047f5c259c 100644 --- a/go.mod +++ b/go.mod @@ -94,7 +94,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kafkareceiver v0.138.0 github.com/open-telemetry/opentelemetry-collector-contrib/receiver/opencensusreceiver v0.133.0 github.com/open-telemetry/opentelemetry-collector-contrib/receiver/zipkinreceiver v0.138.0 - github.com/parquet-go/parquet-go v0.25.2-0.20250911172247-41fe9a8fbd81 + github.com/parquet-go/parquet-go v0.25.2-0.20251113212313-bb7dcf6d014e github.com/twmb/franz-go v1.20.3 github.com/twmb/franz-go/pkg/kadm v1.17.1 github.com/twmb/franz-go/pkg/kfake v0.0.0-20251107035046-d7de41391da4 @@ -282,6 +282,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor v0.136.0 // indirect github.com/opentracing-contrib/go-stdlib v1.0.0 // indirect github.com/openzipkin/zipkin-go v0.4.3 // indirect + github.com/parquet-go/bitpack v0.1.0 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/philhofer/fwd v1.2.0 // indirect github.com/pires/go-proxyproto v0.7.0 // indirect diff --git a/go.sum b/go.sum index ae4b1ea6dea..6cfc95c3b29 100644 --- a/go.sum +++ b/go.sum @@ -747,8 +747,10 @@ github.com/openzipkin/zipkin-go v0.4.3 h1:9EGwpqkgnwdEIJ+Od7QVSEIH+ocmm5nPat0G7s github.com/openzipkin/zipkin-go v0.4.3/go.mod h1:M9wCJZFWCo2RiY+o1eBCEMe0Dp2S5LDHcMZmk3RmK7c= github.com/ovh/go-ovh v1.9.0 h1:6K8VoL3BYjVV3In9tPJUdT7qMx9h0GExN9EXx1r2kKE= github.com/ovh/go-ovh v1.9.0/go.mod h1:cTVDnl94z4tl8pP1uZ/8jlVxntjSIf09bNcQ5TJSC7c= -github.com/parquet-go/parquet-go v0.25.2-0.20250911172247-41fe9a8fbd81 h1:8puGZGSy7h3j6ku89ygASK1IIM/MBflJ57CkP5p1tHI= -github.com/parquet-go/parquet-go v0.25.2-0.20250911172247-41fe9a8fbd81/go.mod h1:QIVRG8YkQoopF7Cyg4LjEf/AifXw29R/X+A3ZCpJK/I= +github.com/parquet-go/bitpack v0.1.0 h1:zOo3XUOvwkWFmzG0pCbTCyZcO/BDXZDQCZ458pQOnH0= +github.com/parquet-go/bitpack v0.1.0/go.mod 
h1:XnVk9TH+O40eOOmvpAVZ7K2ocQFrQwysLMnc6M/8lgs= +github.com/parquet-go/parquet-go v0.25.2-0.20251113212313-bb7dcf6d014e h1:1JOiodbwMgJxfKZHYKZGmhCoNecZw5UZrulu41jMzb0= +github.com/parquet-go/parquet-go v0.25.2-0.20251113212313-bb7dcf6d014e/go.mod h1:Z1s1Wf0E38wyRhCq8XX+S0PrY/ofAeEiPR212wWNtYY= github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pborman/getopt v0.0.0-20170112200414-7148bc3a4c30/go.mod h1:85jBQOZwpVEaDAr341tbn15RS4fCAsIst0qp7i8ex1o= diff --git a/vendor/github.com/parquet-go/bitpack/.gitignore b/vendor/github.com/parquet-go/bitpack/.gitignore new file mode 100644 index 00000000000..b3584c8d4de --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/.gitignore @@ -0,0 +1,21 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib +*.py + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Emacs +*~ +#*# +.# diff --git a/vendor/github.com/parquet-go/bitpack/LICENSE b/vendor/github.com/parquet-go/bitpack/LICENSE new file mode 100644 index 00000000000..c3e15a69d15 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2025 Achille Roussel, Filip Petkovski + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/parquet-go/bitpack/README.md b/vendor/github.com/parquet-go/bitpack/README.md new file mode 100644 index 00000000000..1f02d8f3bcb --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/README.md @@ -0,0 +1,31 @@ +# bitpack + +[![Go Reference](https://pkg.go.dev/badge/github.com/parquet-go/bitpack.svg)](https://pkg.go.dev/github.com/parquet-go/bitpack) + +A high-performance Go library for bit packing and unpacking integers of various bit widths. Part of +the [parquet-go](https://github.com/parquet-go/parquet-go) ecosystem. + +Includes AMD64 assembly optimizations with pure Go fallback for portability. + +```bash +go get github.com/parquet-go/bitpack +``` + +## Usage + +```go +import "github.com/parquet-go/bitpack" + +// Pack int32 values with 3-bit width +values := []int32{1, 2, 3, 4, 5} +bitWidth := uint(3) +packedSize := bitpack.ByteCount(uint(len(values)) * bitWidth) +dst := make([]byte, packedSize+bitpack.PaddingInt32) +bitpack.PackInt32(dst, values, bitWidth) + +// Unpack int32 values +unpacked := make([]int32, len(values)) +bitpack.UnpackInt32(unpacked, dst, bitWidth) +``` + +For complete working examples, see the [examples](./examples) directory. 
diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/bitpack.go b/vendor/github.com/parquet-go/bitpack/bitpack.go similarity index 62% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/bitpack.go rename to vendor/github.com/parquet-go/bitpack/bitpack.go index e6a11884d6d..4a55472515c 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/bitpack.go +++ b/vendor/github.com/parquet-go/bitpack/bitpack.go @@ -2,6 +2,12 @@ // integers of various bit widths. package bitpack +// Int is a type constraint representing the integer types that this package +// supports. +type Int interface { + ~int32 | ~uint32 | ~int64 | ~uint64 | ~int | ~uintptr +} + // ByteCount returns the number of bytes needed to hold the given bit count. func ByteCount(bitCount uint) int { return int((bitCount + 7) / 8) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/masks_int32_amd64.s b/vendor/github.com/parquet-go/bitpack/masks_int32_amd64.s similarity index 100% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/masks_int32_amd64.s rename to vendor/github.com/parquet-go/bitpack/masks_int32_amd64.s diff --git a/vendor/github.com/parquet-go/bitpack/pack.go b/vendor/github.com/parquet-go/bitpack/pack.go new file mode 100644 index 00000000000..a30bd8d3ea0 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/pack.go @@ -0,0 +1,19 @@ +package bitpack + +import ( + "unsafe" + + "github.com/parquet-go/bitpack/unsafecast" +) + +// Pack packs values from src to dst, each value is packed into the given +// bit width regardless of how many bits are needed to represent it. 
+func Pack[T Int](dst []byte, src []T, bitWidth uint) { + _ = dst[:ByteCount(bitWidth*uint(len(src)))] + switch unsafe.Sizeof(T(0)) { + case 4: + packInt32(dst, unsafecast.Slice[int32](src), bitWidth) + default: + packInt64(dst, unsafecast.Slice[int64](src), bitWidth) + } +} diff --git a/vendor/github.com/parquet-go/bitpack/pack_arm64.go b/vendor/github.com/parquet-go/bitpack/pack_arm64.go new file mode 100644 index 00000000000..202950d5d66 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/pack_arm64.go @@ -0,0 +1,31 @@ +//go:build !purego + +package bitpack + +//go:noescape +func packInt32ARM64(dst []byte, src []int32, bitWidth uint) + +//go:noescape +func packInt32NEON(dst []byte, src []int32, bitWidth uint) + +//go:noescape +func packInt64ARM64(dst []byte, src []int64, bitWidth uint) + +//go:noescape +func packInt64NEON(dst []byte, src []int64, bitWidth uint) + +func packInt32(dst []byte, src []int32, bitWidth uint) { + if bitWidth <= 8 { + packInt32NEON(dst, src, bitWidth) + } else { + packInt32ARM64(dst, src, bitWidth) + } +} + +func packInt64(dst []byte, src []int64, bitWidth uint) { + if bitWidth <= 8 { + packInt64NEON(dst, src, bitWidth) + } else { + packInt64ARM64(dst, src, bitWidth) + } +} diff --git a/vendor/github.com/parquet-go/bitpack/pack_int32_arm64.s b/vendor/github.com/parquet-go/bitpack/pack_int32_arm64.s new file mode 100644 index 00000000000..5c47327cd99 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/pack_int32_arm64.s @@ -0,0 +1,462 @@ +//go:build !purego + +#include "funcdata.h" +#include "textflag.h" + +// func packInt32ARM64(dst []byte, src []int32, bitWidth uint) +TEXT ·packInt32ARM64(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD src_base+24(FP), R1 // R1 = src pointer + MOVD src_len+32(FP), R2 // R2 = src length + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + // Handle bitWidth == 0 + CBZ R3, done + + // R4 = bitMask = (1 << bitWidth) - 1 + MOVD $1, R4 + LSL R3, R4, R4 + SUB $1, R4, R4 + + // R5 
= buffer (64-bit accumulator) + // R6 = bufferedBits + // R7 = byteIndex + // R8 = loop counter (src index) + MOVD $0, R5 + MOVD $0, R6 + MOVD $0, R7 + MOVD $0, R8 + + // Main loop: process each value from src +loop: + CMP R2, R8 + BEQ flush_remaining + + // Load value from src[R8] + LSL $2, R8, R16 // R16 = R8 * 4 + MOVWU (R1)(R16), R9 // R9 = src[R8] + + // Mask the value: R9 = value & bitMask + AND R4, R9, R9 + + // Add to buffer: buffer |= (value << bufferedBits) + LSL R6, R9, R10 // R10 = value << bufferedBits + ORR R10, R5, R5 // buffer |= R10 + + // bufferedBits += bitWidth + ADD R3, R6, R6 + + // Increment source index + ADD $1, R8, R8 + +flush_loop: + // While bufferedBits >= 32, flush 32-bit words + CMP $32, R6 + BLT loop + + // Write 32-bit word to dst[byteIndex] + MOVW R5, (R0)(R7) + + // buffer >>= 32 + LSR $32, R5, R5 + + // bufferedBits -= 32 + SUB $32, R6, R6 + + // byteIndex += 4 + ADD $4, R7, R7 + + B flush_loop + +flush_remaining: + // If no bits remaining, we're done + CBZ R6, done + + // Calculate remaining bytes = (bufferedBits + 7) / 8 + ADD $7, R6, R11 + LSR $3, R11, R11 // R11 = remainingBytes + + MOVD $0, R12 // R12 = i (byte counter) + +flush_byte_loop: + CMP R11, R12 + BEQ done + + // dst[byteIndex] = byte(buffer) + MOVB R5, (R0)(R7) + + // buffer >>= 8 + LSR $8, R5, R5 + + // byteIndex++, i++ + ADD $1, R7, R7 + ADD $1, R12, R12 + + B flush_byte_loop + +done: + RET + +// func packInt32NEON(dst []byte, src []int32, bitWidth uint) +TEXT ·packInt32NEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD src_base+24(FP), R1 // R1 = src pointer + MOVD src_len+32(FP), R2 // R2 = src length + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + // Handle bitWidth == 0 + CBZ R3, neon_done + + // Initialize processed count to 0 + MOVD $0, R5 + + // Check if we have at least 4 values to process with NEON paths + CMP $4, R2 + BLT neon_done // Not enough values, return and let Go wrapper handle it + + // Determine which NEON path to 
use based on bitWidth + CMP $1, R3 + BEQ neon_1bit + CMP $2, R3 + BEQ neon_2bit + CMP $3, R3 + BEQ neon_3bit + CMP $4, R3 + BEQ neon_4bit + CMP $5, R3 + BEQ neon_5bit + CMP $6, R3 + BEQ neon_6bit + CMP $7, R3 + BEQ neon_7bit + CMP $8, R3 + BEQ neon_8bit + + // For other bit widths, return without processing + // The Go wrapper will call the scalar version + RET + +neon_1bit: + // BitWidth 1: Pack 8 int32 values into 1 byte + MOVD R2, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length + MOVD $0, R5 // R5 = index + CMP $0, R4 + BEQ neon_done + +neon_1bit_loop: + MOVWU (R1), R6 + AND $1, R6, R6 + MOVWU 4(R1), R7 + AND $1, R7, R7 + ORR R7<<1, R6, R6 + MOVWU 8(R1), R7 + AND $1, R7, R7 + ORR R7<<2, R6, R6 + MOVWU 12(R1), R7 + AND $1, R7, R7 + ORR R7<<3, R6, R6 + MOVWU 16(R1), R7 + AND $1, R7, R7 + ORR R7<<4, R6, R6 + MOVWU 20(R1), R7 + AND $1, R7, R7 + ORR R7<<5, R6, R6 + MOVWU 24(R1), R7 + AND $1, R7, R7 + ORR R7<<6, R6, R6 + MOVWU 28(R1), R7 + AND $1, R7, R7 + ORR R7<<7, R6, R6 + MOVB R6, (R0) + ADD $32, R1, R1 + ADD $1, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_1bit_loop + B neon_done + +neon_2bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_2bit_loop: + MOVWU (R1), R6 + AND $3, R6, R6 + MOVWU 4(R1), R7 + AND $3, R7, R7 + ORR R7<<2, R6, R6 + MOVWU 8(R1), R7 + AND $3, R7, R7 + ORR R7<<4, R6, R6 + MOVWU 12(R1), R7 + AND $3, R7, R7 + ORR R7<<6, R6, R6 + MOVB R6, (R0) + ADD $16, R1, R1 + ADD $1, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_2bit_loop + B neon_done + +neon_3bit: + MOVD R2, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_3bit_loop: + MOVWU (R1), R6 + AND $7, R6, R6 + MOVWU 4(R1), R7 + AND $7, R7, R7 + ORR R7<<3, R6, R6 + MOVWU 8(R1), R7 + AND $7, R7, R7 + ORR R7<<6, R6, R6 + MOVWU 12(R1), R7 + AND $7, R7, R7 + ORR R7<<9, R6, R6 + MOVWU 16(R1), R7 + AND $7, R7, R7 + ORR R7<<12, R6, R6 + MOVWU 20(R1), R7 + AND $7, R7, R7 + ORR 
R7<<15, R6, R6 + MOVWU 24(R1), R7 + AND $7, R7, R7 + ORR R7<<18, R6, R6 + MOVWU 28(R1), R7 + AND $7, R7, R7 + ORR R7<<21, R6, R6 + MOVB R6, (R0) + LSR $8, R6, R7 + MOVB R7, 1(R0) + LSR $16, R6, R7 + MOVB R7, 2(R0) + ADD $32, R1, R1 + ADD $3, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_3bit_loop + B neon_done + +neon_4bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_4bit_loop: + MOVWU (R1), R6 + AND $15, R6, R6 + MOVWU 4(R1), R7 + AND $15, R7, R7 + ORR R7<<4, R6, R6 + MOVWU 8(R1), R7 + AND $15, R7, R7 + ORR R7<<8, R6, R6 + MOVWU 12(R1), R7 + AND $15, R7, R7 + ORR R7<<12, R6, R6 + MOVH R6, (R0) + ADD $16, R1, R1 + ADD $2, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_4bit_loop + B neon_done + +neon_5bit: + MOVD R2, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_5bit_loop: + MOVD $0, R6 + MOVWU (R1), R7 + AND $31, R7, R7 + ORR R7, R6, R6 + MOVWU 4(R1), R7 + AND $31, R7, R7 + ORR R7<<5, R6, R6 + MOVWU 8(R1), R7 + AND $31, R7, R7 + ORR R7<<10, R6, R6 + MOVWU 12(R1), R7 + AND $31, R7, R7 + ORR R7<<15, R6, R6 + MOVWU 16(R1), R7 + AND $31, R7, R7 + ORR R7<<20, R6, R6 + MOVWU 20(R1), R7 + AND $31, R7, R7 + ORR R7<<25, R6, R6 + MOVWU 24(R1), R7 + AND $31, R7, R7 + ORR R7<<30, R6, R6 + MOVWU 28(R1), R7 + AND $31, R7, R7 + ORR R7<<35, R6, R6 + MOVB R6, (R0) + LSR $8, R6, R7 + MOVB R7, 1(R0) + LSR $16, R6, R7 + MOVB R7, 2(R0) + LSR $24, R6, R7 + MOVB R7, 3(R0) + LSR $32, R6, R7 + MOVB R7, 4(R0) + ADD $32, R1, R1 + ADD $5, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_5bit_loop + B neon_done + +neon_6bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_6bit_loop: + MOVWU (R1), R6 + AND $63, R6, R6 + MOVWU 4(R1), R7 + AND $63, R7, R7 + ORR R7<<6, R6, R6 + MOVWU 8(R1), R7 + AND $63, R7, R7 + ORR R7<<12, R6, R6 + MOVWU 12(R1), R7 + AND $63, R7, R7 + ORR R7<<18, R6, R6 + MOVB R6, (R0) + LSR $8, R6, R7 + MOVB R7, 1(R0) + LSR $16, 
R6, R7 + MOVB R7, 2(R0) + ADD $16, R1, R1 + ADD $3, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_6bit_loop + B neon_done + +neon_7bit: + MOVD R2, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_7bit_loop: + MOVD $0, R6 + MOVWU (R1), R7 + AND $127, R7, R7 + ORR R7, R6, R6 + MOVWU 4(R1), R7 + AND $127, R7, R7 + ORR R7<<7, R6, R6 + MOVWU 8(R1), R7 + AND $127, R7, R7 + ORR R7<<14, R6, R6 + MOVWU 12(R1), R7 + AND $127, R7, R7 + ORR R7<<21, R6, R6 + MOVWU 16(R1), R7 + AND $127, R7, R7 + ORR R7<<28, R6, R6 + MOVWU 20(R1), R7 + AND $127, R7, R7 + ORR R7<<35, R6, R6 + MOVWU 24(R1), R7 + AND $127, R7, R7 + ORR R7<<42, R6, R6 + MOVWU 28(R1), R7 + AND $127, R7, R7 + ORR R7<<49, R6, R6 + MOVB R6, (R0) + LSR $8, R6, R7 + MOVB R7, 1(R0) + LSR $16, R6, R7 + MOVB R7, 2(R0) + LSR $24, R6, R7 + MOVB R7, 3(R0) + LSR $32, R6, R7 + MOVB R7, 4(R0) + LSR $40, R6, R7 + MOVB R7, 5(R0) + LSR $48, R6, R7 + MOVB R7, 6(R0) + ADD $32, R1, R1 + ADD $7, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_7bit_loop + B neon_done + +neon_8bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_8bit_loop: + MOVWU (R1), R6 + MOVB R6, (R0) + MOVWU 4(R1), R6 + MOVB R6, 1(R0) + MOVWU 8(R1), R6 + MOVB R6, 2(R0) + MOVWU 12(R1), R6 + MOVB R6, 3(R0) + ADD $16, R1, R1 + ADD $4, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_8bit_loop + +neon_done: + // After NEON processing, handle any remainder with scalar code + // Check if there are remaining values to process + CMP R2, R5 // R5 = processed count, R2 = total length + BGE neon_ret // If processed >= total, we're done + + // Calculate remainder: adjust src/dst pointers and length + // Advance src pointer by (R5 * 4) bytes + LSL $2, R5, R16 + ADD R16, R1, R1 + + // Calculate packed bytes for processed values and advance dst + MUL R3, R5, R16 // R16 = processed * bitWidth (in bits) + LSR $3, R16, R16 // R16 = packed bytes + ADD R16, R0, R0 + + // Update remaining length + 
SUB R5, R2, R2 + + // Jump to scalar implementation for remainder + B ·packInt32ARM64(SB) + +neon_ret: + RET diff --git a/vendor/github.com/parquet-go/bitpack/pack_int64_arm64.s b/vendor/github.com/parquet-go/bitpack/pack_int64_arm64.s new file mode 100644 index 00000000000..9b4137d110d --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/pack_int64_arm64.s @@ -0,0 +1,514 @@ +//go:build !purego + +#include "funcdata.h" +#include "textflag.h" + +// func packInt64ARM64(dst []byte, src []int64, bitWidth uint) +TEXT ·packInt64ARM64(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD src_base+24(FP), R1 // R1 = src pointer + MOVD src_len+32(FP), R2 // R2 = src length + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + // Handle bitWidth == 0 + CBZ R3, done + + // Special case: bitWidth == 64 (no packing needed) + CMP $64, R3 + BEQ copy_direct + + // R4 = bitMask = (1 << bitWidth) - 1 + MOVD $1, R4 + LSL R3, R4, R4 + SUB $1, R4, R4 + + // R5 = bufferLo (64-bit accumulator) + // R6 = bufferHi (overflow buffer) + // R7 = bufferedBits + // R8 = byteIndex + // R9 = loop counter (src index) + MOVD $0, R5 + MOVD $0, R6 + MOVD $0, R7 + MOVD $0, R8 + MOVD $0, R9 + + // Main loop: process each value from src +loop: + CMP R2, R9 + BEQ flush_remaining + + // Load value from src[R9] + LSL $3, R9, R16 // R16 = R9 * 8 + MOVD (R1)(R16), R10 // R10 = src[R9] + + // Mask the value: R10 = value & bitMask + AND R4, R10, R10 + + // Check if value fits entirely in low buffer + ADD R3, R7, R11 // R11 = bufferedBits + bitWidth + CMP $64, R11 + BGT spans_buffers + + // Value fits in low buffer + LSL R7, R10, R12 // R12 = value << bufferedBits + ORR R12, R5, R5 // bufferLo |= R12 + MOVD R11, R7 // bufferedBits = R11 + B increment_index + +spans_buffers: + // Value spans low and high buffers + // bitsInLo = 64 - bufferedBits + MOVD $64, R12 + SUB R7, R12, R12 // R12 = bitsInLo + + // bufferLo |= value << bufferedBits + LSL R7, R10, R13 + ORR R13, R5, R5 + + // bufferHi = 
value >> bitsInLo + LSR R12, R10, R6 + + // bufferedBits += bitWidth + MOVD R11, R7 + +increment_index: + // Increment source index + ADD $1, R9, R9 + +flush_loop: + // While bufferedBits >= 64, flush 64-bit words + CMP $64, R7 + BLT loop + + // Write 64-bit word to dst[byteIndex] + MOVD R5, (R0)(R8) + + // bufferLo = bufferHi + MOVD R6, R5 + + // bufferHi = 0 + MOVD $0, R6 + + // bufferedBits -= 64 + SUB $64, R7, R7 + + // byteIndex += 8 + ADD $8, R8, R8 + + B flush_loop + +flush_remaining: + // If no bits remaining, we're done + CBZ R7, done + + // Calculate remaining bytes = (bufferedBits + 7) / 8 + ADD $7, R7, R11 + LSR $3, R11, R11 // R11 = remainingBytes + + MOVD $0, R12 // R12 = i (byte counter) + +flush_byte_loop: + CMP R11, R12 + BEQ done + + // dst[byteIndex] = byte(bufferLo) + MOVB R5, (R0)(R8) + + // bufferLo >>= 8 + LSR $8, R5, R5 + + // byteIndex++, i++ + ADD $1, R8, R8 + ADD $1, R12, R12 + + B flush_byte_loop + +copy_direct: + // bitWidth == 64: direct copy + MOVD $0, R9 // R9 = index + MOVD $0, R10 // R10 = byte offset + +copy_loop: + CMP R2, R9 + BEQ done + + // Load src[i] + LSL $3, R9, R16 + MOVD (R1)(R16), R11 + + // Store to dst[i*8] + MOVD R11, (R0)(R10) + + // i++, offset += 8 + ADD $1, R9, R9 + ADD $8, R10, R10 + + B copy_loop + +done: + RET + +// func packInt64NEON(dst []byte, src []int64, bitWidth uint) +TEXT ·packInt64NEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD src_base+24(FP), R1 // R1 = src pointer + MOVD src_len+32(FP), R2 // R2 = src length + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + // Handle bitWidth == 0 + CBZ R3, neon_done + + // Initialize processed count to 0 + MOVD $0, R5 + + // Check if we have at least 4 values to process with NEON paths + CMP $4, R2 + BLT neon_done // Not enough values, return and let Go wrapper handle it + + // Determine which NEON path to use based on bitWidth + CMP $1, R3 + BEQ neon_1bit + CMP $2, R3 + BEQ neon_2bit + CMP $3, R3 + BEQ neon_3bit + CMP $4, R3 + BEQ 
neon_4bit + CMP $5, R3 + BEQ neon_5bit + CMP $6, R3 + BEQ neon_6bit + CMP $7, R3 + BEQ neon_7bit + CMP $8, R3 + BEQ neon_8bit + + // For other bit widths, return without processing + // The Go wrapper will call the scalar version + RET + +neon_1bit: + // BitWidth 1: Pack 8 int64 values into 1 byte + MOVD R2, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length + MOVD $0, R5 // R5 = index + CMP $0, R4 + BEQ neon_done + +neon_1bit_loop: + MOVD (R1), R6 + AND $1, R6, R6 + MOVD 8(R1), R7 + AND $1, R7, R7 + ORR R7<<1, R6, R6 + MOVD 16(R1), R7 + AND $1, R7, R7 + ORR R7<<2, R6, R6 + MOVD 24(R1), R7 + AND $1, R7, R7 + ORR R7<<3, R6, R6 + MOVD 32(R1), R7 + AND $1, R7, R7 + ORR R7<<4, R6, R6 + MOVD 40(R1), R7 + AND $1, R7, R7 + ORR R7<<5, R6, R6 + MOVD 48(R1), R7 + AND $1, R7, R7 + ORR R7<<6, R6, R6 + MOVD 56(R1), R7 + AND $1, R7, R7 + ORR R7<<7, R6, R6 + MOVB R6, (R0) + ADD $64, R1, R1 + ADD $1, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_1bit_loop + B neon_done + +neon_2bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_2bit_loop: + MOVD (R1), R6 + AND $3, R6, R6 + MOVD 8(R1), R7 + AND $3, R7, R7 + ORR R7<<2, R6, R6 + MOVD 16(R1), R7 + AND $3, R7, R7 + ORR R7<<4, R6, R6 + MOVD 24(R1), R7 + AND $3, R7, R7 + ORR R7<<6, R6, R6 + MOVB R6, (R0) + ADD $32, R1, R1 + ADD $1, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_2bit_loop + B neon_done + +neon_3bit: + MOVD R2, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_3bit_loop: + MOVD (R1), R6 + AND $7, R6, R6 + MOVD 8(R1), R7 + AND $7, R7, R7 + ORR R7<<3, R6, R6 + MOVD 16(R1), R7 + AND $7, R7, R7 + ORR R7<<6, R6, R6 + MOVD 24(R1), R7 + AND $7, R7, R7 + ORR R7<<9, R6, R6 + MOVD 32(R1), R7 + AND $7, R7, R7 + ORR R7<<12, R6, R6 + MOVD 40(R1), R7 + AND $7, R7, R7 + ORR R7<<15, R6, R6 + MOVD 48(R1), R7 + AND $7, R7, R7 + ORR R7<<18, R6, R6 + MOVD 56(R1), R7 + AND $7, R7, R7 + ORR R7<<21, R6, R6 + MOVB R6, (R0) + LSR 
$8, R6, R7 + MOVB R7, 1(R0) + LSR $16, R6, R7 + MOVB R7, 2(R0) + ADD $64, R1, R1 + ADD $3, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_3bit_loop + B neon_done + +neon_4bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_4bit_loop: + MOVD (R1), R6 + AND $15, R6, R6 + MOVD 8(R1), R7 + AND $15, R7, R7 + ORR R7<<4, R6, R6 + MOVD 16(R1), R7 + AND $15, R7, R7 + ORR R7<<8, R6, R6 + MOVD 24(R1), R7 + AND $15, R7, R7 + ORR R7<<12, R6, R6 + MOVH R6, (R0) + ADD $32, R1, R1 + ADD $2, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_4bit_loop + B neon_done + +neon_5bit: + MOVD R2, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_5bit_loop: + MOVD $0, R6 + MOVD (R1), R7 + AND $31, R7, R7 + ORR R7, R6, R6 + MOVD 8(R1), R7 + AND $31, R7, R7 + ORR R7<<5, R6, R6 + MOVD 16(R1), R7 + AND $31, R7, R7 + ORR R7<<10, R6, R6 + MOVD 24(R1), R7 + AND $31, R7, R7 + ORR R7<<15, R6, R6 + MOVD 32(R1), R7 + AND $31, R7, R7 + ORR R7<<20, R6, R6 + MOVD 40(R1), R7 + AND $31, R7, R7 + ORR R7<<25, R6, R6 + MOVD 48(R1), R7 + AND $31, R7, R7 + ORR R7<<30, R6, R6 + MOVD 56(R1), R7 + AND $31, R7, R7 + ORR R7<<35, R6, R6 + MOVB R6, (R0) + LSR $8, R6, R7 + MOVB R7, 1(R0) + LSR $16, R6, R7 + MOVB R7, 2(R0) + LSR $24, R6, R7 + MOVB R7, 3(R0) + LSR $32, R6, R7 + MOVB R7, 4(R0) + ADD $64, R1, R1 + ADD $5, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_5bit_loop + B neon_done + +neon_6bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_6bit_loop: + MOVD (R1), R6 + AND $63, R6, R6 + MOVD 8(R1), R7 + AND $63, R7, R7 + ORR R7<<6, R6, R6 + MOVD 16(R1), R7 + AND $63, R7, R7 + ORR R7<<12, R6, R6 + MOVD 24(R1), R7 + AND $63, R7, R7 + ORR R7<<18, R6, R6 + MOVB R6, (R0) + LSR $8, R6, R7 + MOVB R7, 1(R0) + LSR $16, R6, R7 + MOVB R7, 2(R0) + ADD $32, R1, R1 + ADD $3, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_6bit_loop + B neon_done + +neon_7bit: + MOVD R2, R4 + LSR $3, R4, 
R4 + LSL $3, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_7bit_loop: + MOVD $0, R6 + MOVD (R1), R7 + AND $127, R7, R7 + ORR R7, R6, R6 + MOVD 8(R1), R7 + AND $127, R7, R7 + ORR R7<<7, R6, R6 + MOVD 16(R1), R7 + AND $127, R7, R7 + ORR R7<<14, R6, R6 + MOVD 24(R1), R7 + AND $127, R7, R7 + ORR R7<<21, R6, R6 + MOVD 32(R1), R7 + AND $127, R7, R7 + ORR R7<<28, R6, R6 + MOVD 40(R1), R7 + AND $127, R7, R7 + ORR R7<<35, R6, R6 + MOVD 48(R1), R7 + AND $127, R7, R7 + ORR R7<<42, R6, R6 + MOVD 56(R1), R7 + AND $127, R7, R7 + ORR R7<<49, R6, R6 + MOVB R6, (R0) + LSR $8, R6, R7 + MOVB R7, 1(R0) + LSR $16, R6, R7 + MOVB R7, 2(R0) + LSR $24, R6, R7 + MOVB R7, 3(R0) + LSR $32, R6, R7 + MOVB R7, 4(R0) + LSR $40, R6, R7 + MOVB R7, 5(R0) + LSR $48, R6, R7 + MOVB R7, 6(R0) + ADD $64, R1, R1 + ADD $7, R0, R0 + ADD $8, R5, R5 + CMP R4, R5 + BLT neon_7bit_loop + B neon_done + +neon_8bit: + MOVD R2, R4 + LSR $2, R4, R4 + LSL $2, R4, R4 + MOVD $0, R5 + CMP $0, R4 + BEQ neon_done + +neon_8bit_loop: + MOVD (R1), R6 + MOVB R6, (R0) + MOVD 8(R1), R6 + MOVB R6, 1(R0) + MOVD 16(R1), R6 + MOVB R6, 2(R0) + MOVD 24(R1), R6 + MOVB R6, 3(R0) + ADD $32, R1, R1 + ADD $4, R0, R0 + ADD $4, R5, R5 + CMP R4, R5 + BLT neon_8bit_loop + +neon_done: + // After NEON processing, handle any remainder with scalar code + // Check if there are remaining values to process + CMP R2, R5 // R5 = processed count, R2 = total length + BGE neon_ret // If processed >= total, we're done + + // Calculate remainder: adjust src/dst pointers and length + // Advance src pointer by (R5 * 8) bytes + LSL $3, R5, R16 + ADD R16, R1, R1 + + // Calculate packed bytes for processed values and advance dst + MUL R3, R5, R16 // R16 = processed * bitWidth (in bits) + LSR $3, R16, R16 // R16 = packed bytes + ADD R16, R0, R0 + + // Update remaining length + SUB R5, R2, R2 + + // Jump to scalar implementation for remainder + B ·packInt64ARM64(SB) + +neon_ret: + RET diff --git a/vendor/github.com/parquet-go/bitpack/pack_purego.go 
b/vendor/github.com/parquet-go/bitpack/pack_purego.go new file mode 100644 index 00000000000..8d02d75ffbc --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/pack_purego.go @@ -0,0 +1,94 @@ +//go:build purego || !arm64 + +package bitpack + +import "encoding/binary" + +func packInt32(dst []byte, src []int32, bitWidth uint) { + if bitWidth == 0 { + return + } + + bitMask := uint32(1<= 32 { + binary.LittleEndian.PutUint32(dst[byteIndex:], uint32(buffer)) + buffer >>= 32 + bufferedBits -= 32 + byteIndex += 4 + } + } + + // Flush remaining bits + if bufferedBits > 0 { + // Only write the bytes we need + remainingBytes := (bufferedBits + 7) / 8 + for i := uint(0); i < remainingBytes; i++ { + dst[byteIndex] = byte(buffer) + buffer >>= 8 + byteIndex++ + } + } +} + +func packInt64(dst []byte, src []int64, bitWidth uint) { + if bitWidth == 0 { + return + } + if bitWidth == 64 { + // Special case: no packing needed, direct copy + for i, v := range src { + binary.LittleEndian.PutUint64(dst[i*8:], uint64(v)) + } + return + } + + bitMask := uint64(1<> bitsInLo + bufferedBits += bitWidth + } + + // Flush complete 64-bit words + for bufferedBits >= 64 { + binary.LittleEndian.PutUint64(dst[byteIndex:], bufferLo) + bufferLo = bufferHi + bufferHi = 0 + bufferedBits -= 64 + byteIndex += 8 + } + } + + // Flush remaining bits + if bufferedBits > 0 { + remainingBytes := (bufferedBits + 7) / 8 + for i := uint(0); i < remainingBytes; i++ { + dst[byteIndex] = byte(bufferLo) + bufferLo >>= 8 + byteIndex++ + } + } +} diff --git a/vendor/github.com/parquet-go/bitpack/unpack.go b/vendor/github.com/parquet-go/bitpack/unpack.go new file mode 100644 index 00000000000..2396e0dd960 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack.go @@ -0,0 +1,29 @@ +package bitpack + +import ( + "unsafe" + + "github.com/parquet-go/bitpack/unsafecast" +) + +// PaddingInt32 is the padding expected to exist after the end of input buffers +// for the UnpackInt32 algorithm to avoid reading beyond the 
end of the input. +const PaddingInt32 = 16 + +// PaddingInt64 is the padding expected to exist after the end of input buffers +// for the UnpackInt32 algorithm to avoid reading beyond the end of the input. +const PaddingInt64 = 32 + +// Unpack unpacks values from src to dst, each value is unpacked from the given +// bit width regardless of how many bits are needed to represent it. +func Unpack[T Int](dst []T, src []byte, bitWidth uint) { + sizeofT := uint(unsafe.Sizeof(T(0))) + padding := (8 * sizeofT) / 2 // 32 bits => 16, 64 bits => 32 + _ = src[:ByteCount(bitWidth*uint(len(dst))+8*padding)] + switch sizeofT { + case 4: + unpackInt32(unsafecast.Slice[int32](dst), src, bitWidth) + default: + unpackInt64(unsafecast.Slice[int64](dst), src, bitWidth) + } +} diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int32_1bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int32_1bit_arm64.s new file mode 100644 index 00000000000..6f6cb28379c --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_1bit_arm64.s @@ -0,0 +1,184 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt32x1bitNEON implements NEON unpacking for bitWidth=1 using direct bit manipulation +// Each byte contains 8 bits: [bit7][bit6][bit5][bit4][bit3][bit2][bit1][bit0] +// +// func unpackInt32x1bitNEON(dst []int32, src []byte, bitWidth uint) +TEXT ·unpackInt32x1bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 1) + + MOVD $0, R5 // R5 = index (initialize early for tail path) + + // Check if we have at least 64 values to process + CMP $64, R1 + BLT neon1_tail + + // Round down to multiple of 64 for NEON processing + MOVD R1, R4 + LSR $6, R4, R4 // R4 = len / 64 + LSL $6, R4, R4 // R4 = aligned length (multiple of 64) + + // Load mask for 1 bit (0x01010101...) 
+ MOVD $0x0101010101010101, R6 + VMOV R6, V31.D[0] + VMOV R6, V31.D[1] // V31 = mask for single bits + +neon1_loop: + // Load 8 bytes (contains 64 x 1-bit values) + VLD1 (R2), [V0.B8] + + // Extract each bit position (8 separate streams) + VAND V31.B16, V0.B16, V1.B16 // V1 = bit 0 + + VUSHR $1, V0.B16, V2.B16 + VAND V31.B16, V2.B16, V2.B16 // V2 = bit 1 + + VUSHR $2, V0.B16, V3.B16 + VAND V31.B16, V3.B16, V3.B16 // V3 = bit 2 + + VUSHR $3, V0.B16, V4.B16 + VAND V31.B16, V4.B16, V4.B16 // V4 = bit 3 + + VUSHR $4, V0.B16, V5.B16 + VAND V31.B16, V5.B16, V5.B16 // V5 = bit 4 + + VUSHR $5, V0.B16, V6.B16 + VAND V31.B16, V6.B16, V6.B16 // V6 = bit 5 + + VUSHR $6, V0.B16, V7.B16 + VAND V31.B16, V7.B16, V7.B16 // V7 = bit 6 + + VUSHR $7, V0.B16, V8.B16 + VAND V31.B16, V8.B16, V8.B16 // V8 = bit 7 + + // Stage 1: ZIP pairs (8 streams → 4 streams of pairs) + VZIP1 V2.B8, V1.B8, V9.B8 // V9 = [bit0,bit1] interleaved + VZIP1 V4.B8, V3.B8, V10.B8 // V10 = [bit2,bit3] interleaved + VZIP1 V6.B8, V5.B8, V11.B8 // V11 = [bit4,bit5] interleaved + VZIP1 V8.B8, V7.B8, V12.B8 // V12 = [bit6,bit7] interleaved + + VZIP2 V2.B8, V1.B8, V13.B8 // V13 = [bit0,bit1] upper half + VZIP2 V4.B8, V3.B8, V14.B8 // V14 = [bit2,bit3] upper half + VZIP2 V6.B8, V5.B8, V15.B8 // V15 = [bit4,bit5] upper half + VZIP2 V8.B8, V7.B8, V16.B8 // V16 = [bit6,bit7] upper half + + // Stage 2: ZIP quads (4 streams → 2 streams of quads) + VZIP1 V10.H4, V9.H4, V17.H4 // V17 = [0,1,2,3] interleaved + VZIP1 V12.H4, V11.H4, V18.H4 // V18 = [4,5,6,7] interleaved + VZIP2 V10.H4, V9.H4, V19.H4 // V19 = [0,1,2,3] next + VZIP2 V12.H4, V11.H4, V20.H4 // V20 = [4,5,6,7] next + + VZIP1 V14.H4, V13.H4, V21.H4 // V21 = upper [0,1,2,3] + VZIP1 V16.H4, V15.H4, V22.H4 // V22 = upper [4,5,6,7] + VZIP2 V14.H4, V13.H4, V23.H4 // V23 = upper [0,1,2,3] next + VZIP2 V16.H4, V15.H4, V24.H4 // V24 = upper [4,5,6,7] next + + // Stage 3: ZIP octets (2 streams → fully sequential) + VZIP1 V18.S2, V17.S2, V25.S2 // V25 = values 0-7 + VZIP2 
V18.S2, V17.S2, V26.S2 // V26 = values 8-15 + VZIP1 V20.S2, V19.S2, V27.S2 // V27 = values 16-23 + VZIP2 V20.S2, V19.S2, V28.S2 // V28 = values 24-31 + VZIP1 V22.S2, V21.S2, V1.S2 // V1 = values 32-39 + VZIP2 V22.S2, V21.S2, V2.S2 // V2 = values 40-47 + VZIP1 V24.S2, V23.S2, V3.S2 // V3 = values 48-55 + VZIP2 V24.S2, V23.S2, V4.S2 // V4 = values 56-63 + + // Widen to int32 and store - Process first 32 values + USHLL_8H_8B(5, 25) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + USHLL_8H_8B(5, 26) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + USHLL_8H_8B(5, 27) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + USHLL_8H_8B(5, 28) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + // Process second 32 values + USHLL_8H_8B(5, 1) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + USHLL_8H_8B(5, 2) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + USHLL_8H_8B(5, 3) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + USHLL_8H_8B(5, 4) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $64, R5, R5 // index += 64 + + CMP R4, R5 + BLT neon1_loop + +neon1_tail: + // Handle remaining elements with scalar fallback + CMP R1, R5 + BEQ neon1_done + + // Compute remaining elements + SUB R5, R1, R1 + + // Fall back to scalar unpack for tail + MOVD $1, R4 // bitMask = 1 + MOVD $0, R6 // bitOffset = 0 + MOVD $0, R7 // index = 0 + B neon1_scalar_test + +neon1_scalar_loop: + MOVD R6, R8 + LSR $3, R8, R8 // byte_index = bitOffset / 8 + MOVBU (R2)(R8), R9 // Load byte + + MOVD R6, R10 + AND $7, R10, R10 // bit_offset = bitOffset % 8 + + LSR R10, R9, R9 // Shift right by bit offset + AND $1, R9, R9 // Mask to 
get bit + MOVW R9, (R0) // Store as int32 + + ADD $4, R0, R0 // dst++ + ADD $1, R6, R6 // bitOffset++ + ADD $1, R7, R7 // index++ + +neon1_scalar_test: + CMP R1, R7 + BLT neon1_scalar_loop + +neon1_done: + RET diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int32_2bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int32_2bit_arm64.s new file mode 100644 index 00000000000..7acea640e75 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_2bit_arm64.s @@ -0,0 +1,136 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt32x2bitNEON implements NEON unpacking for bitWidth=2 using direct bit manipulation +// Each byte contains 4 values of 2 bits each: [bits 6-7][bits 4-5][bits 2-3][bits 0-1] +// +// func unpackInt32x2bitNEON(dst []int32, src []byte, bitWidth uint) +TEXT ·unpackInt32x2bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 2) + + MOVD $0, R5 // R5 = index (initialize early for tail path) + + // Check if we have at least 32 values to process + CMP $32, R1 + BLT neon2_tail + + // Round down to multiple of 32 for NEON processing + MOVD R1, R4 + LSR $5, R4, R4 // R4 = len / 32 + LSL $5, R4, R4 // R4 = aligned length (multiple of 32) + + // Load mask for 2 bits (0x03030303...) + MOVD $0x0303030303030303, R6 + VMOV R6, V31.D[0] + VMOV R6, V31.D[1] // V31 = mask for 2-bit values + +neon2_loop: + // Load 8 bytes (contains 32 x 2-bit values) + VLD1 (R2), [V0.B8] + + // Extract bits [1:0] from each byte (values at positions 0,4,8,12,...) + VAND V31.B16, V0.B16, V1.B16 + + // Extract bits [3:2] from each byte (values at positions 1,5,9,13,...) + VUSHR $2, V0.B16, V2.B16 + VAND V31.B16, V2.B16, V2.B16 + + // Extract bits [5:4] from each byte (values at positions 2,6,10,14,...) 
+ VUSHR $4, V0.B16, V3.B16 + VAND V31.B16, V3.B16, V3.B16 + + // Extract bits [7:6] from each byte (values at positions 3,7,11,15,...) + VUSHR $6, V0.B16, V4.B16 + VAND V31.B16, V4.B16, V4.B16 + + // Interleave using two stages of ZIP operations + // Stage 1: ZIP pairs at byte level + VZIP1 V2.B8, V1.B8, V5.B8 // V5 = [V1[0],V2[0],V1[1],V2[1],V1[2],V2[2],V1[3],V2[3]] + VZIP1 V4.B8, V3.B8, V6.B8 // V6 = [V3[0],V4[0],V3[1],V4[1],V3[2],V4[2],V3[3],V4[3]] + VZIP2 V2.B8, V1.B8, V7.B8 // V7 = [V1[4],V2[4],V1[5],V2[5],V1[6],V2[6],V1[7],V2[7]] + VZIP2 V4.B8, V3.B8, V8.B8 // V8 = [V3[4],V4[4],V3[5],V4[5],V3[6],V4[6],V3[7],V4[7]] + + // Stage 2: ZIP quads at 16-bit level to get final sequential order + VZIP1 V6.H4, V5.H4, V13.H4 // V13 = [V1[0],V2[0],V3[0],V4[0],V1[1],V2[1],V3[1],V4[1]] = values 0-7 + VZIP2 V6.H4, V5.H4, V14.H4 // V14 = [V1[2],V2[2],V3[2],V4[2],V1[3],V2[3],V3[3],V4[3]] = values 8-15 + VZIP1 V8.H4, V7.H4, V15.H4 // V15 = [V1[4],V2[4],V3[4],V4[4],V1[5],V2[5],V3[5],V4[5]] = values 16-23 + VZIP2 V8.H4, V7.H4, V16.H4 // V16 = [V1[6],V2[6],V3[6],V4[6],V1[7],V2[7],V3[7],V4[7]] = values 24-31 + + // Widen first 8 values (V13) to int32 + USHLL_8H_8B(17, 13) // V17.8H ← V13.8B + USHLL_4S_4H(18, 17) // V18.4S ← V17.4H (values 0-3) + USHLL2_4S_8H(19, 17) // V19.4S ← V17.8H (values 4-7) + + // Widen second 8 values (V14) to int32 + USHLL_8H_8B(20, 14) // V20.8H ← V14.8B + USHLL_4S_4H(21, 20) // V21.4S ← V20.4H (values 8-11) + USHLL2_4S_8H(22, 20) // V22.4S ← V20.8H (values 12-15) + + // Widen third 8 values (V15) to int32 + USHLL_8H_8B(23, 15) // V23.8H ← V15.8B + USHLL_4S_4H(24, 23) // V24.4S ← V23.4H (values 16-19) + USHLL2_4S_8H(25, 23) // V25.4S ← V23.8H (values 20-23) + + // Widen fourth 8 values (V16) to int32 + USHLL_8H_8B(26, 16) // V26.8H ← V16.8B + USHLL_4S_4H(27, 26) // V27.4S ← V26.4H (values 24-27) + USHLL2_4S_8H(28, 26) // V28.4S ← V26.8H (values 28-31) + + // Store 32 int32 values (128 bytes) + VST1 [V18.S4, V19.S4], (R0) + ADD $32, R0, R0 + VST1 [V21.S4, 
V22.S4], (R0) + ADD $32, R0, R0 + VST1 [V24.S4, V25.S4], (R0) + ADD $32, R0, R0 + VST1 [V27.S4, V28.S4], (R0) + ADD $32, R0, R0 + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $32, R5, R5 // index += 32 + + CMP R4, R5 + BLT neon2_loop + +neon2_tail: + // Handle remaining elements with scalar fallback + CMP R1, R5 + BEQ neon2_done + + // Compute remaining elements + SUB R5, R1, R1 + + // Fall back to scalar unpack for tail + MOVD $3, R4 // bitMask = 3 (0b11 for 2 bits) + MOVD $0, R6 // bitOffset = 0 + MOVD $0, R7 // index = 0 + B neon2_scalar_test + +neon2_scalar_loop: + MOVD R6, R8 + LSR $3, R8, R8 // byte_index = bitOffset / 8 + MOVBU (R2)(R8), R9 // Load byte + + MOVD R6, R10 + AND $7, R10, R10 // bit_offset = bitOffset % 8 + + LSR R10, R9, R9 // Shift right by bit offset + AND $3, R9, R9 // Mask to get 2 bits + MOVW R9, (R0) // Store as int32 + + ADD $4, R0, R0 // dst++ + ADD $2, R6, R6 // bitOffset += 2 + ADD $1, R7, R7 // index++ + +neon2_scalar_test: + CMP R1, R7 + BLT neon2_scalar_loop + +neon2_done: + RET diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int32_4bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int32_4bit_arm64.s new file mode 100644 index 00000000000..05360b4a467 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_4bit_arm64.s @@ -0,0 +1,106 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt32x4bitNEON implements NEON unpacking for bitWidth=4 using direct bit manipulation +// Each byte contains 2 values of 4 bits each +// +// func unpackInt32x4bitNEON(dst []int32, src []byte, bitWidth uint) +TEXT ·unpackInt32x4bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 4) + + MOVD $0, R5 // R5 = index (initialize early for tail path) + + // Check if we have at least 16 values to 
process + CMP $16, R1 + BLT neon4_tail + + // Round down to multiple of 16 for NEON processing + MOVD R1, R4 + LSR $4, R4, R4 // R4 = len / 16 + LSL $4, R4, R4 // R4 = aligned length (multiple of 16) + + // Load mask for 4 bits (0x0F0F0F0F...) + MOVD $0x0F0F0F0F0F0F0F0F, R6 + VMOV R6, V31.D[0] + VMOV R6, V31.D[1] // V31 = mask for low nibbles + +neon4_loop: + // Load 8 bytes (contains 16 x 4-bit values) + VLD1 (R2), [V0.B8] + + // Extract low nibbles (values at even nibble positions) + VAND V31.B16, V0.B16, V1.B16 // V1 = low nibbles + + // Extract high nibbles (values at odd nibble positions) + VUSHR $4, V0.B16, V2.B16 // V2 = high nibbles (shifted down) + VAND V31.B16, V2.B16, V2.B16 // V2 = high nibbles (masked) + + // Now V1 has values [0,2,4,6,8,10,12,14] and V2 has [1,3,5,7,9,11,13,15] + // We need to interleave them: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + VZIP1 V2.B8, V1.B8, V3.B8 // V3 = interleaved low half + VZIP2 V2.B8, V1.B8, V4.B8 // V4 = interleaved high half + + // Widen first 8 values (V3) to int32 + USHLL_8H_8B(5, 3) // V5.8H ← V3.8B + USHLL_4S_4H(6, 5) // V6.4S ← V5.4H (values 0-3) + USHLL2_4S_8H(7, 5) // V7.4S ← V5.8H (values 4-7) + + // Widen second 8 values (V4) to int32 + USHLL_8H_8B(8, 4) // V8.8H ← V4.8B + USHLL_4S_4H(9, 8) // V9.4S ← V8.4H (values 8-11) + USHLL2_4S_8H(10, 8) // V10.4S ← V8.8H (values 12-15) + + // Store 16 int32 values (64 bytes) + VST1 [V6.S4, V7.S4], (R0) + ADD $32, R0, R0 + VST1 [V9.S4, V10.S4], (R0) + ADD $32, R0, R0 + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $16, R5, R5 // index += 16 + + CMP R4, R5 + BLT neon4_loop + +neon4_tail: + // Handle remaining elements with scalar fallback + CMP R1, R5 + BEQ neon4_done + + // Compute remaining elements + SUB R5, R1, R1 + + // Fall back to scalar unpack for tail + MOVD $0x0F, R4 // bitMask = 0x0F (4 bits) + MOVD $0, R6 // bitOffset = 0 (start from current R2 position) + MOVD $0, R7 // loop counter = 0 + B neon4_scalar_test + +neon4_scalar_loop: + MOVD 
R6, R8 + LSR $3, R8, R8 // byte_index = bitOffset / 8 + MOVBU (R2)(R8), R9 // Load byte from current position + + MOVD R6, R10 + AND $7, R10, R10 // bit_offset = bitOffset % 8 + + LSR R10, R9, R9 // Shift right by bit offset + AND $0x0F, R9, R9 // Mask to get 4 bits + MOVW R9, (R0) // Store as int32 + + ADD $4, R0, R0 // dst++ + ADD $4, R6, R6 // bitOffset += 4 + ADD $1, R7, R7 // counter++ + +neon4_scalar_test: + CMP R1, R7 + BLT neon4_scalar_loop + +neon4_done: + RET diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int32_8bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int32_8bit_arm64.s new file mode 100644 index 00000000000..40d5bf088a3 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_8bit_arm64.s @@ -0,0 +1,65 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt32x8bitNEON implements NEON unpacking for bitWidth=8 +// Each byte is already a complete value - just widen to int32 +// Processes 8 values at a time using NEON +// +// func unpackInt32x8bitNEON(dst []int32, src []byte, bitWidth uint) +TEXT ·unpackInt32x8bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 8) + + MOVD $0, R5 // R5 = index + + // Check if we have at least 8 values to process + CMP $8, R1 + BLT tbl8_tail + + // Round down to multiple of 8 for NEON processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + +tbl8_loop: + // Load 8 bytes (8 x 8-bit values) + VLD1 (R2), [V0.B8] + + // Widen to int32: byte → short → int + USHLL_8H_8B(1, 0) // V1.8H ← V0.8B (8x8-bit → 8x16-bit) + USHLL_4S_4H(2, 1) // V2.4S ← V1.4H (lower 4x16-bit → 4x32-bit) + USHLL2_4S_8H(3, 1) // V3.4S ← V1.8H (upper 4x16-bit → 4x32-bit) + + // Store 8 int32 values + VST1 [V2.S4, V3.S4], (R0) + + // Advance pointers + 
ADD $8, R2, R2 // src += 8 bytes + ADD $32, R0, R0 // dst += 8 int32 (32 bytes) + ADD $8, R5, R5 // index += 8 + + CMP R4, R5 + BLT tbl8_loop + +tbl8_tail: + // Handle remaining elements (0-7) one by one + CMP R1, R5 + BGE tbl8_done + +tbl8_tail_loop: + MOVBU (R2), R6 // Load byte + MOVW R6, (R0) // Store as int32 (zero-extended) + + ADD $1, R2, R2 // src++ + ADD $4, R0, R0 // dst++ + ADD $1, R5, R5 // index++ + + CMP R1, R5 + BLT tbl8_tail_loop + +tbl8_done: + RET diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_amd64.go b/vendor/github.com/parquet-go/bitpack/unpack_int32_amd64.go similarity index 93% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_amd64.go rename to vendor/github.com/parquet-go/bitpack/unpack_int32_amd64.go index f3932223919..8783f3f82d1 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_amd64.go +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_amd64.go @@ -3,7 +3,7 @@ package bitpack import ( - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" "golang.org/x/sys/cpu" ) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_amd64.s b/vendor/github.com/parquet-go/bitpack/unpack_int32_amd64.s similarity index 100% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_amd64.s rename to vendor/github.com/parquet-go/bitpack/unpack_int32_amd64.s diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int32_arm64.go b/vendor/github.com/parquet-go/bitpack/unpack_int32_arm64.go new file mode 100644 index 00000000000..e73d1fdf71e --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_arm64.go @@ -0,0 +1,48 @@ +//go:build !purego + +package bitpack + +import ( + "github.com/parquet-go/bitpack/unsafecast" +) + +//go:noescape +func unpackInt32Default(dst []int32, src []byte, bitWidth uint) + +//go:noescape +func 
unpackInt32x1to16bitsARM64(dst []int32, src []byte, bitWidth uint) + +//go:noescape +func unpackInt32x1bitNEON(dst []int32, src []byte, bitWidth uint) + +//go:noescape +func unpackInt32x2bitNEON(dst []int32, src []byte, bitWidth uint) + +//go:noescape +func unpackInt32x3bitNEON(dst []int32, src []byte, bitWidth uint) + +//go:noescape +func unpackInt32x4bitNEON(dst []int32, src []byte, bitWidth uint) + +//go:noescape +func unpackInt32x8bitNEON(dst []int32, src []byte, bitWidth uint) + +func unpackInt32(dst []int32, src []byte, bitWidth uint) { + switch { + case bitWidth == 1: + unpackInt32x1bitNEON(dst, src, bitWidth) + case bitWidth == 2: + unpackInt32x2bitNEON(dst, src, bitWidth) + case bitWidth == 4: + unpackInt32x4bitNEON(dst, src, bitWidth) + case bitWidth == 8: + unpackInt32x8bitNEON(dst, src, bitWidth) + // bitWidth == 3,5,6,7: Skip NEON table (don't divide evenly into 8) + case bitWidth <= 16: + unpackInt32x1to16bitsARM64(dst, src, bitWidth) + case bitWidth == 32: + copy(dst, unsafecast.Slice[int32](src)) + default: + unpackInt32Default(dst, src, bitWidth) + } +} diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int32_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int32_arm64.s new file mode 100644 index 00000000000..d3c0bda5486 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_arm64.s @@ -0,0 +1,732 @@ +//go:build !purego + +#include "funcdata.h" +#include "textflag.h" + +// func unpackInt32Default(dst []int32, src []byte, bitWidth uint) +TEXT ·unpackInt32Default(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + MOVD $1, R4 // R4 = bitMask = (1 << bitWidth) - 1 + LSL R3, R4, R4 + SUB $1, R4, R4 + + MOVD $0, R5 // R5 = bitOffset + MOVD $0, R6 // R6 = index + B test + +loop: + MOVD R5, R7 // R7 = i = bitOffset / 32 + LSR $5, R7, R7 + + MOVD R5, R8 // R8 = j = bitOffset % 32 
+ AND $31, R8, R8 + + LSL $2, R7, R16 // R16 = i * 4 + MOVWU (R2)(R16), R9 // R9 = src[i] + MOVW R4, R10 // R10 = bitMask + LSL R8, R10, R10 // R10 = bitMask << j + AND R10, R9, R9 // R9 = src[i] & (bitMask << j) + LSR R8, R9, R9 // R9 = d = (src[i] & (bitMask << j)) >> j + + ADD R3, R8, R11 // R11 = j + bitWidth + CMP $32, R11 + BLE next // if j+bitWidth <= 32, skip to next + + ADD $1, R7, R12 // R12 = i + 1 + LSL $2, R12, R16 // R16 = (i + 1) * 4 + MOVWU (R2)(R16), R13 // R13 = src[i+1] + + MOVD $32, R14 // R14 = k = 32 - j + SUB R8, R14, R14 + + MOVW R4, R15 // R15 = bitMask + LSR R14, R15, R15 // R15 = bitMask >> k + AND R15, R13, R13 // R13 = src[i+1] & (bitMask >> k) + LSL R14, R13, R13 // R13 = (src[i+1] & (bitMask >> k)) << k + ORR R13, R9, R9 // R9 = d | c + +next: + LSL $2, R6, R16 // R16 = index * 4 + MOVW R9, (R0)(R16) // dst[index] = d + ADD R3, R5, R5 // bitOffset += bitWidth + ADD $1, R6, R6 // index++ + +test: + CMP R1, R6 + BNE loop + RET + +// unpackInt32x1to16bitsARM64 implements optimized unpacking for bit widths 1-16 +// Uses optimized scalar ARM64 operations with batched processing +// +// func unpackInt32x1to16bitsARM64(dst []int32, src []byte, bitWidth uint) +TEXT ·unpackInt32x1to16bitsARM64(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + // Check if we have at least 4 values to process + CMP $4, R1 + BLT scalar_fallback + + // Determine which NEON path to use based on bitWidth + CMP $1, R3 + BEQ neon_1bit + CMP $2, R3 + BEQ neon_2bit + CMP $3, R3 + BEQ neon_3bit + CMP $4, R3 + BEQ neon_4bit + CMP $5, R3 + BEQ neon_5bit + CMP $6, R3 + BEQ neon_6bit + CMP $7, R3 + BEQ neon_7bit + CMP $8, R3 + BEQ neon_8bit + CMP $16, R3 + BEQ neon_16bit + + // For other bit widths, fall back to scalar + B scalar_fallback + +neon_1bit: + // BitWidth 1: 8 int32 values packed in 1 byte + // Process 8 values 
at a time using scalar operations + + // Round down to multiple of 8 for processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + + MOVD $0, R5 // R5 = index + CMP $0, R4 + BEQ scalar_fallback + +neon_1bit_loop: + // Load 1 byte (contains 8 values, 1 bit each) + MOVBU (R2), R6 + + // Extract 8 values manually (bits 0-7) + // Value 0: bit 0 + AND $1, R6, R7 + MOVW R7, (R0) + + // Value 1: bit 1 + LSR $1, R6, R7 + AND $1, R7, R7 + MOVW R7, 4(R0) + + // Value 2: bit 2 + LSR $2, R6, R7 + AND $1, R7, R7 + MOVW R7, 8(R0) + + // Value 3: bit 3 + LSR $3, R6, R7 + AND $1, R7, R7 + MOVW R7, 12(R0) + + // Value 4: bit 4 + LSR $4, R6, R7 + AND $1, R7, R7 + MOVW R7, 16(R0) + + // Value 5: bit 5 + LSR $5, R6, R7 + AND $1, R7, R7 + MOVW R7, 20(R0) + + // Value 6: bit 6 + LSR $6, R6, R7 + AND $1, R7, R7 + MOVW R7, 24(R0) + + // Value 7: bit 7 + LSR $7, R6, R7 + AND $1, R7, R7 + MOVW R7, 28(R0) + + // Advance pointers + ADD $1, R2, R2 // src += 1 byte (8 values) + ADD $32, R0, R0 // dst += 8 int32 (32 bytes) + ADD $8, R5, R5 // index += 8 + + CMP R4, R5 + BLT neon_1bit_loop + + CMP R1, R5 + BEQ neon_done + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_2bit: + // BitWidth 2: 4 int32 values packed in 1 byte + // Process 4 values at a time using scalar operations + + MOVD R1, R4 + LSR $2, R4, R4 // R4 = len / 4 + LSL $2, R4, R4 // R4 = aligned length (multiple of 4) + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback + +neon_2bit_loop: + // Load 1 byte (contains 4 values, 2 bits each) + MOVBU (R2), R6 + + // Extract 4 values manually (bits 0-1, 2-3, 4-5, 6-7) + // Value 0: bits 0-1 + AND $3, R6, R7 + MOVW R7, (R0) + + // Value 1: bits 2-3 + LSR $2, R6, R7 + AND $3, R7, R7 + MOVW R7, 4(R0) + + // Value 2: bits 4-5 + LSR $4, R6, R7 + AND $3, R7, R7 + MOVW R7, 8(R0) + + // Value 3: bits 6-7 + LSR $6, R6, R7 + AND $3, R7, R7 + MOVW R7, 12(R0) + + // Advance pointers + ADD $1, R2, R2 // src += 1 byte (4 values) + ADD $16, R0, R0 
// dst += 4 int32 (16 bytes) + ADD $4, R5, R5 // index += 4 + + CMP R4, R5 + BLT neon_2bit_loop + + CMP R1, R5 + BEQ neon_done + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_3bit: + // BitWidth 3: 8 int32 values packed in 3 bytes + // Process 8 values at a time using scalar operations + + // Round down to multiple of 8 for processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback + +neon_3bit_loop: + // Load 3 bytes as 32-bit value (4th byte will be ignored) + // Bytes 0-2 contain: [val7:val6:val5:val4:val3:val2:val1:val0] + // Bits layout: [23:21][20:18][17:15][14:12][11:9][8:6][5:3][2:0] + MOVWU (R2), R6 + + // Value 0: bits 0-2 + AND $7, R6, R7 + MOVW R7, (R0) + + // Value 1: bits 3-5 + LSR $3, R6, R7 + AND $7, R7, R7 + MOVW R7, 4(R0) + + // Value 2: bits 6-8 + LSR $6, R6, R7 + AND $7, R7, R7 + MOVW R7, 8(R0) + + // Value 3: bits 9-11 + LSR $9, R6, R7 + AND $7, R7, R7 + MOVW R7, 12(R0) + + // Value 4: bits 12-14 + LSR $12, R6, R7 + AND $7, R7, R7 + MOVW R7, 16(R0) + + // Value 5: bits 15-17 + LSR $15, R6, R7 + AND $7, R7, R7 + MOVW R7, 20(R0) + + // Value 6: bits 18-20 + LSR $18, R6, R7 + AND $7, R7, R7 + MOVW R7, 24(R0) + + // Value 7: bits 21-23 + LSR $21, R6, R7 + AND $7, R7, R7 + MOVW R7, 28(R0) + + // Advance pointers + ADD $3, R2, R2 // src += 3 bytes (8 values) + ADD $32, R0, R0 // dst += 8 int32 (32 bytes) + ADD $8, R5, R5 // index += 8 + + CMP R4, R5 + BLT neon_3bit_loop + + CMP R1, R5 + BEQ neon_done + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_4bit: + // BitWidth 4: 4 int32 values packed in 2 bytes + // Process 4 values at a time using scalar operations + + MOVD R1, R4 + LSR $2, R4, R4 // R4 = len / 4 + LSL $2, R4, R4 // R4 = aligned length (multiple of 4) + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback + +neon_4bit_loop: + // Load 2 bytes (contains 4 values, 4 bits each) + MOVHU (R2), R6 + + // Extract 4 values manually (nibbles) + 
// Value 0: bits 0-3 + AND $15, R6, R7 + MOVW R7, (R0) + + // Value 1: bits 4-7 + LSR $4, R6, R7 + AND $15, R7, R7 + MOVW R7, 4(R0) + + // Value 2: bits 8-11 + LSR $8, R6, R7 + AND $15, R7, R7 + MOVW R7, 8(R0) + + // Value 3: bits 12-15 + LSR $12, R6, R7 + AND $15, R7, R7 + MOVW R7, 12(R0) + + // Advance pointers + ADD $2, R2, R2 // src += 2 bytes (4 values) + ADD $16, R0, R0 // dst += 4 int32 (16 bytes) + ADD $4, R5, R5 // index += 4 + + CMP R4, R5 + BLT neon_4bit_loop + + CMP R1, R5 + BEQ neon_done + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_5bit: + // BitWidth 5: 8 int32 values packed in 5 bytes + // Process 8 values at a time using scalar operations + + // Round down to multiple of 8 for processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback + +neon_5bit_loop: + // Load 5 bytes as 64-bit value (upper bytes will be ignored) + // 8 values × 5 bits = 40 bits = 5 bytes + // Bits layout: [39:35][34:30][29:25][24:20][19:15][14:10][9:5][4:0] + MOVD (R2), R6 + + // Value 0: bits 0-4 + AND $31, R6, R7 + MOVW R7, (R0) + + // Value 1: bits 5-9 + LSR $5, R6, R7 + AND $31, R7, R7 + MOVW R7, 4(R0) + + // Value 2: bits 10-14 + LSR $10, R6, R7 + AND $31, R7, R7 + MOVW R7, 8(R0) + + // Value 3: bits 15-19 + LSR $15, R6, R7 + AND $31, R7, R7 + MOVW R7, 12(R0) + + // Value 4: bits 20-24 + LSR $20, R6, R7 + AND $31, R7, R7 + MOVW R7, 16(R0) + + // Value 5: bits 25-29 + LSR $25, R6, R7 + AND $31, R7, R7 + MOVW R7, 20(R0) + + // Value 6: bits 30-34 + LSR $30, R6, R7 + AND $31, R7, R7 + MOVW R7, 24(R0) + + // Value 7: bits 35-39 + LSR $35, R6, R7 + AND $31, R7, R7 + MOVW R7, 28(R0) + + // Advance pointers + ADD $5, R2, R2 // src += 5 bytes (8 values) + ADD $32, R0, R0 // dst += 8 int32 (32 bytes) + ADD $8, R5, R5 // index += 8 + + CMP R4, R5 + BLT neon_5bit_loop + + CMP R1, R5 + BEQ neon_done + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_6bit: + // BitWidth 6: 4 
int32 values packed in 3 bytes + // Process 4 values at a time using scalar operations + + MOVD R1, R4 + LSR $2, R4, R4 // R4 = len / 4 + LSL $2, R4, R4 // R4 = aligned length (multiple of 4) + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback + +neon_6bit_loop: + // Load 3 bytes as 32-bit value (4th byte will be ignored) + // 4 values × 6 bits = 24 bits = 3 bytes + // Bits layout: [23:18][17:12][11:6][5:0] + MOVWU (R2), R6 + + // Value 0: bits 0-5 + AND $63, R6, R7 + MOVW R7, (R0) + + // Value 1: bits 6-11 + LSR $6, R6, R7 + AND $63, R7, R7 + MOVW R7, 4(R0) + + // Value 2: bits 12-17 + LSR $12, R6, R7 + AND $63, R7, R7 + MOVW R7, 8(R0) + + // Value 3: bits 18-23 + LSR $18, R6, R7 + AND $63, R7, R7 + MOVW R7, 12(R0) + + // Advance pointers + ADD $3, R2, R2 // src += 3 bytes (4 values) + ADD $16, R0, R0 // dst += 4 int32 (16 bytes) + ADD $4, R5, R5 // index += 4 + + CMP R4, R5 + BLT neon_6bit_loop + + CMP R1, R5 + BEQ neon_done + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_7bit: + // BitWidth 7: 8 int32 values packed in 7 bytes + // Process 8 values at a time using scalar operations + + // Round down to multiple of 8 for processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback + +neon_7bit_loop: + // Load 7 bytes as 64-bit value (8th byte will be ignored) + // 8 values × 7 bits = 56 bits = 7 bytes + // Bits layout: [55:49][48:42][41:35][34:28][27:21][20:14][13:7][6:0] + MOVD (R2), R6 + + // Value 0: bits 0-6 + AND $127, R6, R7 + MOVW R7, (R0) + + // Value 1: bits 7-13 + LSR $7, R6, R7 + AND $127, R7, R7 + MOVW R7, 4(R0) + + // Value 2: bits 14-20 + LSR $14, R6, R7 + AND $127, R7, R7 + MOVW R7, 8(R0) + + // Value 3: bits 21-27 + LSR $21, R6, R7 + AND $127, R7, R7 + MOVW R7, 12(R0) + + // Value 4: bits 28-34 + LSR $28, R6, R7 + AND $127, R7, R7 + MOVW R7, 16(R0) + + // Value 5: bits 35-41 + LSR $35, R6, R7 + AND $127, R7, R7 + MOVW R7, 20(R0) + + // Value 6: 
bits 42-48 + LSR $42, R6, R7 + AND $127, R7, R7 + MOVW R7, 24(R0) + + // Value 7: bits 49-55 + LSR $49, R6, R7 + AND $127, R7, R7 + MOVW R7, 28(R0) + + // Advance pointers + ADD $7, R2, R2 // src += 7 bytes (8 values) + ADD $32, R0, R0 // dst += 8 int32 (32 bytes) + ADD $8, R5, R5 // index += 8 + + CMP R4, R5 + BLT neon_7bit_loop + + CMP R1, R5 + BEQ neon_done + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_8bit: + // BitWidth 8: 4 int32 values packed in 4 bytes + // Process 4 values at a time using NEON + + // Calculate how many full groups of 4 we can process + MOVD R1, R4 + LSR $2, R4, R4 // R4 = len / 4 + LSL $2, R4, R4 // R4 = (len / 4) * 4 = aligned length + + MOVD $0, R5 // R5 = index + CMP $0, R4 + BEQ scalar_fallback + +neon_8bit_loop: + // Load 4 bytes as 4 uint8 values into lower part of V0 + // We need to load bytes and zero-extend to 32-bit + + // Load 4 bytes to W6 + MOVWU (R2), R6 + + // Extract bytes and write as int32 + // Byte 0 + AND $0xFF, R6, R7 + MOVW R7, (R0) + + // Byte 1 + LSR $8, R6, R7 + AND $0xFF, R7, R7 + MOVW R7, 4(R0) + + // Byte 2 + LSR $16, R6, R7 + AND $0xFF, R7, R7 + MOVW R7, 8(R0) + + // Byte 3 + LSR $24, R6, R7 + MOVW R7, 12(R0) + + // Advance pointers + ADD $4, R2, R2 // src += 4 bytes + ADD $16, R0, R0 // dst += 4 int32 (16 bytes) + ADD $4, R5, R5 // index += 4 + + CMP R4, R5 + BLT neon_8bit_loop + + // Handle tail with scalar + CMP R1, R5 + BEQ neon_done + + // Calculate remaining elements + SUB R5, R1, R1 // R1 = remaining elements + B scalar_fallback_entry + +neon_16bit: + // BitWidth 16: 4 int32 values packed in 8 bytes + // Process 4 values at a time + + MOVD R1, R4 + LSR $2, R4, R4 // R4 = len / 4 + LSL $2, R4, R4 // R4 = (len / 4) * 4 + + MOVD $0, R5 // R5 = index + CMP $0, R4 + BEQ scalar_fallback + +neon_16bit_loop: + // Load 8 bytes as 4 uint16 values + MOVD (R2), R6 // Load 8 bytes into R6 + + // Extract 16-bit values and write as int32 + // Value 0 (bits 0-15) + AND $0xFFFF, R6, R7 + MOVW R7, (R0) + + // Value 1 
(bits 16-31) + LSR $16, R6, R7 + AND $0xFFFF, R7, R7 + MOVW R7, 4(R0) + + // Value 2 (bits 32-47) + LSR $32, R6, R7 + AND $0xFFFF, R7, R7 + MOVW R7, 8(R0) + + // Value 3 (bits 48-63) + LSR $48, R6, R7 + MOVW R7, 12(R0) + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $16, R0, R0 // dst += 4 int32 (16 bytes) + ADD $4, R5, R5 // index += 4 + + CMP R4, R5 + BLT neon_16bit_loop + + // Handle tail with scalar + CMP R1, R5 + BEQ neon_done + + SUB R5, R1, R1 + B scalar_fallback_entry + +neon_done: + RET + +scalar_fallback: + MOVD $0, R5 // Start from beginning + // R0, R1, R2, R3 already set from function args + +scalar_fallback_entry: + // R0 = current dst position (already advanced) + // R1 = remaining elements + // R2 = current src position (already advanced) + // R3 = bitWidth + // R5 = elements already processed + + // Fall back to scalar implementation for remaining elements + CMP $0, R1 + BEQ scalar_done // No remaining elements + + MOVD $1, R4 // R4 = bitMask = (1 << bitWidth) - 1 + LSL R3, R4, R4 + SUB $1, R4, R4 + + // bitOffset starts from 0 relative to current R2 position + // (not total offset, since R2 is already advanced) + MOVD $0, R6 // R6 = bitOffset (relative to current R2) + MOVD $0, R7 // R7 = index (within remaining elements) + B scalar_test + +scalar_loop: + MOVD R6, R8 // R8 = i = bitOffset / 32 + LSR $5, R8, R8 + + MOVD R6, R9 // R9 = j = bitOffset % 32 + AND $31, R9, R9 + + LSL $2, R8, R10 // R10 = i * 4 + MOVWU (R2)(R10), R11 // R11 = src[i] (relative to current R2) + MOVW R4, R12 // R12 = bitMask + LSL R9, R12, R12 // R12 = bitMask << j + AND R12, R11, R11 // R11 = src[i] & (bitMask << j) + LSR R9, R11, R11 // R11 = d = (src[i] & (bitMask << j)) >> j + + ADD R3, R9, R12 // R12 = j + bitWidth + CMP $32, R12 + BLE scalar_next // if j+bitWidth <= 32, skip to next + + ADD $1, R8, R13 // R13 = i + 1 + LSL $2, R13, R10 // R10 = (i + 1) * 4 + MOVWU (R2)(R10), R14 // R14 = src[i+1] + + MOVD $32, R15 // R15 = k = 32 - j + SUB R9, R15, 
R15 + + MOVW R4, R16 // R16 = bitMask + LSR R15, R16, R16 // R16 = bitMask >> k + AND R16, R14, R14 // R14 = src[i+1] & (bitMask >> k) + LSL R15, R14, R14 // R14 = (src[i+1] & (bitMask >> k)) << k + ORR R14, R11, R11 // R11 = d | c + +scalar_next: + LSL $2, R7, R10 // R10 = index * 4 + MOVW R11, (R0)(R10) // dst[index] = d (relative to current R0) + ADD R3, R6, R6 // bitOffset += bitWidth + ADD $1, R7, R7 // index++ + +scalar_test: + CMP R1, R7 + BLT scalar_loop + +scalar_done: + RET + +// Macro definitions for unsupported NEON instructions using WORD encodings +// USHLL Vd.8H, Vn.8B, #0 - widen 8x8-bit to 8x16-bit +#define USHLL_8H_8B(vd, vn) WORD $(0x2f08a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.8H, Vn.16B, #0 - widen upper 8x8-bit to 8x16-bit +#define USHLL2_8H_16B(vd, vn) WORD $(0x6f08a400 | (vd) | ((vn)<<5)) + +// USHLL Vd.4S, Vn.4H, #0 - widen 4x16-bit to 4x32-bit +#define USHLL_4S_4H(vd, vn) WORD $(0x2f10a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.4S, Vn.8H, #0 - widen upper 4x16-bit to 4x32-bit +#define USHLL2_4S_8H(vd, vn) WORD $(0x6f10a400 | (vd) | ((vn)<<5)) + +// USHLL Vd.2D, Vn.2S, #0 - widen 2x32-bit to 2x64-bit +#define USHLL_2D_2S(vd, vn) WORD $(0x2f20a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.2D, Vn.4S, #0 - widen upper 2x32-bit to 2x64-bit +#define USHLL2_2D_4S(vd, vn) WORD $(0x6f20a400 | (vd) | ((vn)<<5)) + +// Bit expansion lookup table defined in bitexpand_table_arm64.s + +// unpackInt32x1bitNEON implements table-based NEON unpacking for bitWidth=1 +// Uses lookup tables for parallel bit expansion +// +// func unpackInt32x1bitNEON(dst []int32, src []byte, bitWidth uint) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_be.go b/vendor/github.com/parquet-go/bitpack/unpack_int32_be.go similarity index 100% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_be.go rename to vendor/github.com/parquet-go/bitpack/unpack_int32_be.go diff --git 
a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_le.go b/vendor/github.com/parquet-go/bitpack/unpack_int32_le.go similarity index 67% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_le.go rename to vendor/github.com/parquet-go/bitpack/unpack_int32_le.go index f754e704ff1..035f6341ea3 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_le.go +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_le.go @@ -2,7 +2,7 @@ package bitpack -import "github.com/parquet-go/parquet-go/internal/unsafecast" +import "github.com/parquet-go/bitpack/unsafecast" func unsafecastBytesToUint32(src []byte) []uint32 { return unsafecast.Slice[uint32](src) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_purego.go b/vendor/github.com/parquet-go/bitpack/unpack_int32_purego.go similarity index 91% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_purego.go rename to vendor/github.com/parquet-go/bitpack/unpack_int32_purego.go index 1e65d8c02b9..71477f6e725 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int32_purego.go +++ b/vendor/github.com/parquet-go/bitpack/unpack_int32_purego.go @@ -1,4 +1,4 @@ -//go:build purego || !amd64 +//go:build purego || (!amd64 && !arm64) package bitpack diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int64_1bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int64_1bit_arm64.s new file mode 100644 index 00000000000..605d6bd3e8d --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_1bit_arm64.s @@ -0,0 +1,239 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt64x1bitNEON implements NEON unpacking for bitWidth=1 using direct bit manipulation +// Each byte contains 8 bits: [bit7][bit6][bit5][bit4][bit3][bit2][bit1][bit0] +// +// func unpackInt64x1bitNEON(dst []int64, src []byte, bitWidth uint) +TEXT 
·unpackInt64x1bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 1) + + MOVD $0, R5 // R5 = index (initialize early for tail path) + + // Check if we have at least 64 values to process + CMP $64, R1 + BLT neon1_tail_int64 + + // Round down to multiple of 64 for NEON processing + MOVD R1, R4 + LSR $6, R4, R4 // R4 = len / 64 + LSL $6, R4, R4 // R4 = aligned length (multiple of 64) + + // Load mask for 1 bit (0x01010101...) + MOVD $0x0101010101010101, R6 + VMOV R6, V31.D[0] + VMOV R6, V31.D[1] // V31 = mask for single bits + +neon1_loop_int64: + // Load 8 bytes (contains 64 x 1-bit values) + VLD1 (R2), [V0.B8] + + // Extract each bit position (8 separate streams) + VAND V31.B16, V0.B16, V1.B16 // V1 = bit 0 + + VUSHR $1, V0.B16, V2.B16 + VAND V31.B16, V2.B16, V2.B16 // V2 = bit 1 + + VUSHR $2, V0.B16, V3.B16 + VAND V31.B16, V3.B16, V3.B16 // V3 = bit 2 + + VUSHR $3, V0.B16, V4.B16 + VAND V31.B16, V4.B16, V4.B16 // V4 = bit 3 + + VUSHR $4, V0.B16, V5.B16 + VAND V31.B16, V5.B16, V5.B16 // V5 = bit 4 + + VUSHR $5, V0.B16, V6.B16 + VAND V31.B16, V6.B16, V6.B16 // V6 = bit 5 + + VUSHR $6, V0.B16, V7.B16 + VAND V31.B16, V7.B16, V7.B16 // V7 = bit 6 + + VUSHR $7, V0.B16, V8.B16 + VAND V31.B16, V8.B16, V8.B16 // V8 = bit 7 + + // Stage 1: ZIP pairs (8 streams → 4 streams of pairs) + VZIP1 V2.B8, V1.B8, V9.B8 // V9 = [bit0,bit1] interleaved + VZIP1 V4.B8, V3.B8, V10.B8 // V10 = [bit2,bit3] interleaved + VZIP1 V6.B8, V5.B8, V11.B8 // V11 = [bit4,bit5] interleaved + VZIP1 V8.B8, V7.B8, V12.B8 // V12 = [bit6,bit7] interleaved + + VZIP2 V2.B8, V1.B8, V13.B8 // V13 = [bit0,bit1] upper half + VZIP2 V4.B8, V3.B8, V14.B8 // V14 = [bit2,bit3] upper half + VZIP2 V6.B8, V5.B8, V15.B8 // V15 = [bit4,bit5] upper half + VZIP2 V8.B8, V7.B8, V16.B8 // V16 = [bit6,bit7] upper half + + // Stage 2: ZIP quads (4 streams → 2 
streams of quads) + VZIP1 V10.H4, V9.H4, V17.H4 // V17 = [0,1,2,3] interleaved + VZIP1 V12.H4, V11.H4, V18.H4 // V18 = [4,5,6,7] interleaved + VZIP2 V10.H4, V9.H4, V19.H4 // V19 = [0,1,2,3] next + VZIP2 V12.H4, V11.H4, V20.H4 // V20 = [4,5,6,7] next + + VZIP1 V14.H4, V13.H4, V21.H4 // V21 = upper [0,1,2,3] + VZIP1 V16.H4, V15.H4, V22.H4 // V22 = upper [4,5,6,7] + VZIP2 V14.H4, V13.H4, V23.H4 // V23 = upper [0,1,2,3] next + VZIP2 V16.H4, V15.H4, V24.H4 // V24 = upper [4,5,6,7] next + + // Stage 3: ZIP octets (2 streams → fully sequential) + VZIP1 V18.S2, V17.S2, V25.S2 // V25 = values 0-7 + VZIP2 V18.S2, V17.S2, V26.S2 // V26 = values 8-15 + VZIP1 V20.S2, V19.S2, V27.S2 // V27 = values 16-23 + VZIP2 V20.S2, V19.S2, V28.S2 // V28 = values 24-31 + VZIP1 V22.S2, V21.S2, V1.S2 // V1 = values 32-39 + VZIP2 V22.S2, V21.S2, V2.S2 // V2 = values 40-47 + VZIP1 V24.S2, V23.S2, V3.S2 // V3 = values 48-55 + VZIP2 V24.S2, V23.S2, V4.S2 // V4 = values 56-63 + + // Widen to int64 and store - each group of 8 values + // Values 0-7 + USHLL_8H_8B(5, 25) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Values 8-15 + USHLL_8H_8B(5, 26) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Values 16-23 + USHLL_8H_8B(5, 27) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Values 24-31 + USHLL_8H_8B(5, 28) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, 
R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Values 32-39 + USHLL_8H_8B(5, 1) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Values 40-47 + USHLL_8H_8B(5, 2) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Values 48-55 + USHLL_8H_8B(5, 3) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Values 56-63 + USHLL_8H_8B(5, 4) + USHLL_4S_4H(6, 5) + USHLL2_4S_8H(7, 5) + USHLL_2D_2S(8, 6) + USHLL2_2D_4S(9, 6) + USHLL_2D_2S(10, 7) + USHLL2_2D_4S(11, 7) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $64, R5, R5 // index += 64 + + CMP R4, R5 + BLT neon1_loop_int64 + +neon1_tail_int64: + // Handle remaining elements with scalar fallback + CMP R1, R5 + BEQ neon1_done_int64 + + // Compute remaining elements + SUB R5, R1, R1 + + // Fall back to scalar unpack for tail + MOVD $1, R4 // bitMask = 1 + MOVD $0, R6 // bitOffset = 0 + MOVD $0, R7 // index = 0 + B neon1_scalar_test_int64 + +neon1_scalar_loop_int64: + MOVD R6, R8 + LSR $3, R8, R8 // byte_index = bitOffset / 8 + MOVBU (R2)(R8), R9 // Load byte + + MOVD R6, R10 + AND $7, R10, R10 // bit_offset = bitOffset % 8 + + LSR R10, R9, R9 // Shift right by bit offset + AND $1, R9, R9 // Mask to get bit + MOVD R9, (R0) // Store as int64 + + ADD $8, R0, R0 // dst++ + ADD $1, R6, R6 // bitOffset++ + ADD $1, R7, R7 // index++ + +neon1_scalar_test_int64: + CMP R1, R7 + BLT 
neon1_scalar_loop_int64 + +neon1_done_int64: + RET diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int64_2bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int64_2bit_arm64.s new file mode 100644 index 00000000000..3b7ac771354 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_2bit_arm64.s @@ -0,0 +1,161 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt64x2bitNEON implements NEON unpacking for bitWidth=2 using direct bit manipulation +// Each byte contains 4 values of 2 bits each: [bits 6-7][bits 4-5][bits 2-3][bits 0-1] +// +// func unpackInt64x2bitNEON(dst []int64, src []byte, bitWidth uint) +TEXT ·unpackInt64x2bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 2) + + MOVD $0, R5 // R5 = index (initialize early for tail path) + + // Check if we have at least 32 values to process + CMP $32, R1 + BLT neon2_tail_int64 + + // Round down to multiple of 32 for NEON processing + MOVD R1, R4 + LSR $5, R4, R4 // R4 = len / 32 + LSL $5, R4, R4 // R4 = aligned length (multiple of 32) + + // Load mask for 2 bits (0x03030303...) + MOVD $0x0303030303030303, R6 + VMOV R6, V31.D[0] + VMOV R6, V31.D[1] // V31 = mask for 2-bit values + +neon2_loop_int64: + // Load 8 bytes (contains 32 x 2-bit values) + VLD1 (R2), [V0.B8] + + // Extract bits [1:0] from each byte (values at positions 0,4,8,12,...) + VAND V31.B16, V0.B16, V1.B16 + + // Extract bits [3:2] from each byte (values at positions 1,5,9,13,...) + VUSHR $2, V0.B16, V2.B16 + VAND V31.B16, V2.B16, V2.B16 + + // Extract bits [5:4] from each byte (values at positions 2,6,10,14,...) + VUSHR $4, V0.B16, V3.B16 + VAND V31.B16, V3.B16, V3.B16 + + // Extract bits [7:6] from each byte (values at positions 3,7,11,15,...) 
+ VUSHR $6, V0.B16, V4.B16 + VAND V31.B16, V4.B16, V4.B16 + + // Use multiple ZIP stages to interleave + VZIP1 V2.B8, V1.B8, V5.B8 // V5 = [V1[0],V2[0],V1[1],V2[1],V1[2],V2[2],V1[3],V2[3]] + VZIP1 V4.B8, V3.B8, V6.B8 // V6 = [V3[0],V4[0],V3[1],V4[1],V3[2],V4[2],V3[3],V4[3]] + VZIP2 V2.B8, V1.B8, V7.B8 // V7 = [V1[4],V2[4],V1[5],V2[5],V1[6],V2[6],V1[7],V2[7]] + VZIP2 V4.B8, V3.B8, V8.B8 // V8 = [V3[4],V4[4],V3[5],V4[5],V3[6],V4[6],V3[7],V4[7]] + + // Now ZIP the pairs + VZIP1 V6.H4, V5.H4, V13.H4 // V13 = [V1[0],V2[0],V3[0],V4[0],V1[1],V2[1],V3[1],V4[1]] + VZIP2 V6.H4, V5.H4, V14.H4 // V14 = [V1[2],V2[2],V3[2],V4[2],V1[3],V2[3],V3[3],V4[3]] + VZIP1 V8.H4, V7.H4, V15.H4 // V15 = [V1[4],V2[4],V3[4],V4[4],V1[5],V2[5],V3[5],V4[5]] + VZIP2 V8.H4, V7.H4, V16.H4 // V16 = [V1[6],V2[6],V3[6],V4[6],V1[7],V2[7],V3[7],V4[7]] + + // Widen first 8 values (V13) to int64 + USHLL_8H_8B(17, 13) // V17.8H ← V13.8B + USHLL_4S_4H(18, 17) // V18.4S ← V17.4H + USHLL2_4S_8H(19, 17) // V19.4S ← V17.8H + USHLL_2D_2S(20, 18) // V20.2D ← V18.2S (values 0-1) + USHLL2_2D_4S(21, 18) // V21.2D ← V18.4S (values 2-3) + USHLL_2D_2S(22, 19) // V22.2D ← V19.2S (values 4-5) + USHLL2_2D_4S(23, 19) // V23.2D ← V19.4S (values 6-7) + + // Widen second 8 values (V14) to int64 + USHLL_8H_8B(24, 14) // V24.8H ← V14.8B + USHLL_4S_4H(25, 24) // V25.4S ← V24.4H + USHLL2_4S_8H(26, 24) // V26.4S ← V24.8H + USHLL_2D_2S(27, 25) // V27.2D ← V25.2S (values 8-9) + USHLL2_2D_4S(28, 25) // V28.2D ← V25.4S (values 10-11) + USHLL_2D_2S(29, 26) // V29.2D ← V26.2S (values 12-13) + USHLL2_2D_4S(30, 26) // V30.2D ← V26.4S (values 14-15) + + // Store first 16 int64 values (128 bytes) + VST1 [V20.D2, V21.D2], (R0) + ADD $32, R0, R0 + VST1 [V22.D2, V23.D2], (R0) + ADD $32, R0, R0 + VST1 [V27.D2, V28.D2], (R0) + ADD $32, R0, R0 + VST1 [V29.D2, V30.D2], (R0) + ADD $32, R0, R0 + + // Widen third 8 values (V15) to int64 + USHLL_8H_8B(17, 15) // V17.8H ← V15.8B (reuse V17) + USHLL_4S_4H(18, 17) // V18.4S ← V17.4H + USHLL2_4S_8H(19, 17) 
// V19.4S ← V17.8H + USHLL_2D_2S(20, 18) // V20.2D ← V18.2S (values 16-17) + USHLL2_2D_4S(21, 18) // V21.2D ← V18.4S (values 18-19) + USHLL_2D_2S(22, 19) // V22.2D ← V19.2S (values 20-21) + USHLL2_2D_4S(23, 19) // V23.2D ← V19.4S (values 22-23) + + // Widen fourth 8 values (V16) to int64 + USHLL_8H_8B(24, 16) // V24.8H ← V16.8B (reuse V24) + USHLL_4S_4H(25, 24) // V25.4S ← V24.4H + USHLL2_4S_8H(26, 24) // V26.4S ← V24.8H + USHLL_2D_2S(27, 25) // V27.2D ← V25.2S (values 24-25) + USHLL2_2D_4S(28, 25) // V28.2D ← V25.4S (values 26-27) + USHLL_2D_2S(29, 26) // V29.2D ← V26.2S (values 28-29) + USHLL2_2D_4S(30, 26) // V30.2D ← V26.4S (values 30-31) + + // Store second 16 int64 values (128 bytes) + VST1 [V20.D2, V21.D2], (R0) + ADD $32, R0, R0 + VST1 [V22.D2, V23.D2], (R0) + ADD $32, R0, R0 + VST1 [V27.D2, V28.D2], (R0) + ADD $32, R0, R0 + VST1 [V29.D2, V30.D2], (R0) + ADD $32, R0, R0 + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $32, R5, R5 // index += 32 + + CMP R4, R5 + BLT neon2_loop_int64 + +neon2_tail_int64: + // Handle remaining elements with scalar fallback + CMP R1, R5 + BEQ neon2_done_int64 + + // Compute remaining elements + SUB R5, R1, R1 + + // Fall back to scalar unpack for tail + MOVD $3, R4 // bitMask = 3 (0b11 for 2 bits) + MOVD $0, R6 // bitOffset = 0 + MOVD $0, R7 // index = 0 + B neon2_scalar_test_int64 + +neon2_scalar_loop_int64: + MOVD R6, R8 + LSR $3, R8, R8 // byte_index = bitOffset / 8 + MOVBU (R2)(R8), R9 // Load byte + + MOVD R6, R10 + AND $7, R10, R10 // bit_offset = bitOffset % 8 + + LSR R10, R9, R9 // Shift right by bit offset + AND $3, R9, R9 // Mask to get 2 bits + MOVD R9, (R0) // Store as int64 + + ADD $8, R0, R0 // dst++ + ADD $2, R6, R6 // bitOffset += 2 + ADD $1, R7, R7 // index++ + +neon2_scalar_test_int64: + CMP R1, R7 + BLT neon2_scalar_loop_int64 + +neon2_done_int64: + RET diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int64_4bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int64_4bit_arm64.s 
new file mode 100644 index 00000000000..671b6ea0bea --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_4bit_arm64.s @@ -0,0 +1,118 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt64x4bitNEON implements NEON unpacking for bitWidth=4 using direct bit manipulation +// Each byte contains 2 values of 4 bits each +// +// func unpackInt64x4bitNEON(dst []int64, src []byte, bitWidth uint) +TEXT ·unpackInt64x4bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 4) + + MOVD $0, R5 // R5 = index (initialize early for tail path) + + // Check if we have at least 16 values to process + CMP $16, R1 + BLT neon4_tail + + // Round down to multiple of 16 for NEON processing + MOVD R1, R4 + LSR $4, R4, R4 // R4 = len / 16 + LSL $4, R4, R4 // R4 = aligned length (multiple of 16) + + // Load mask for 4 bits (0x0F0F0F0F...) 
+ MOVD $0x0F0F0F0F0F0F0F0F, R6 + VMOV R6, V31.D[0] + VMOV R6, V31.D[1] // V31 = mask for low nibbles + +neon4_loop: + // Load 8 bytes (contains 16 x 4-bit values) + VLD1 (R2), [V0.B8] + + // Extract low nibbles (values at even nibble positions) + VAND V31.B16, V0.B16, V1.B16 // V1 = low nibbles + + // Extract high nibbles (values at odd nibble positions) + VUSHR $4, V0.B16, V2.B16 // V2 = high nibbles (shifted down) + VAND V31.B16, V2.B16, V2.B16 // V2 = high nibbles (masked) + + // Now V1 has values [0,2,4,6,8,10,12,14] and V2 has [1,3,5,7,9,11,13,15] + // We need to interleave them: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + VZIP1 V2.B8, V1.B8, V3.B8 // V3 = interleaved low half (values 0-7) + VZIP2 V2.B8, V1.B8, V4.B8 // V4 = interleaved high half (values 8-15) + + // Widen first 8 values (V3) to int64 + USHLL_8H_8B(5, 3) // V5.8H ← V3.8B + USHLL_4S_4H(6, 5) // V6.4S ← V5.4H + USHLL2_4S_8H(7, 5) // V7.4S ← V5.8H + USHLL_2D_2S(8, 6) // V8.2D ← V6.2S (values 0-1) + USHLL2_2D_4S(9, 6) // V9.2D ← V6.4S (values 2-3) + USHLL_2D_2S(10, 7) // V10.2D ← V7.2S (values 4-5) + USHLL2_2D_4S(11, 7) // V11.2D ← V7.4S (values 6-7) + + // Widen second 8 values (V4) to int64 + USHLL_8H_8B(12, 4) // V12.8H ← V4.8B + USHLL_4S_4H(13, 12) // V13.4S ← V12.4H + USHLL2_4S_8H(14, 12) // V14.4S ← V12.8H + USHLL_2D_2S(15, 13) // V15.2D ← V13.2S (values 8-9) + USHLL2_2D_4S(16, 13) // V16.2D ← V13.4S (values 10-11) + USHLL_2D_2S(17, 14) // V17.2D ← V14.2S (values 12-13) + USHLL2_2D_4S(18, 14) // V18.2D ← V14.4S (values 14-15) + + // Store 16 int64 values (128 bytes) + VST1 [V8.D2, V9.D2], (R0) + ADD $32, R0, R0 + VST1 [V10.D2, V11.D2], (R0) + ADD $32, R0, R0 + VST1 [V15.D2, V16.D2], (R0) + ADD $32, R0, R0 + VST1 [V17.D2, V18.D2], (R0) + ADD $32, R0, R0 + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $16, R5, R5 // index += 16 + + CMP R4, R5 + BLT neon4_loop + +neon4_tail: + // Handle remaining elements with scalar fallback + CMP R1, R5 + BEQ neon4_done + + // Compute remaining 
elements + SUB R5, R1, R1 + + // Fall back to scalar unpack for tail + MOVD $0x0F, R4 // bitMask = 0x0F (4 bits) + MOVD $0, R6 // bitOffset = 0 (start from current R2 position) + MOVD $0, R7 // loop counter = 0 + B neon4_scalar_test + +neon4_scalar_loop: + MOVD R6, R8 + LSR $3, R8, R8 // byte_index = bitOffset / 8 + MOVBU (R2)(R8), R9 // Load byte from current position + + MOVD R6, R10 + AND $7, R10, R10 // bit_offset = bitOffset % 8 + + LSR R10, R9, R9 // Shift right by bit offset + AND $0x0F, R9, R9 // Mask to get 4 bits + MOVD R9, (R0) // Store as int64 + + ADD $8, R0, R0 // dst++ + ADD $4, R6, R6 // bitOffset += 4 + ADD $1, R7, R7 // counter++ + +neon4_scalar_test: + CMP R1, R7 + BLT neon4_scalar_loop + +neon4_done: + RET diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int64_8bit_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int64_8bit_arm64.s new file mode 100644 index 00000000000..2e8a4e479ff --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_8bit_arm64.s @@ -0,0 +1,71 @@ +//go:build !purego + +#include "textflag.h" +#include "unpack_neon_macros_arm64.h" + +// unpackInt64x8bitNEON implements NEON unpacking for bitWidth=8 +// Each byte is already a complete value - just widen to int64 +// Processes 8 values at a time using NEON +// +// func unpackInt64x8bitNEON(dst []int64, src []byte, bitWidth uint) +TEXT ·unpackInt64x8bitNEON(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth (should be 8) + + MOVD $0, R5 // R5 = index + + // Check if we have at least 8 values to process + CMP $8, R1 + BLT tbl8_tail + + // Round down to multiple of 8 for NEON processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + +tbl8_loop: + // Load 8 bytes (8 x 8-bit values) + VLD1 (R2), [V0.B8] + + // Widen to int64: byte → short → int → long + 
USHLL_8H_8B(1, 0) // V1.8H ← V0.8B (8x8-bit → 8x16-bit) + USHLL_4S_4H(2, 1) // V2.4S ← V1.4H (lower 4x16-bit → 4x32-bit) + USHLL2_4S_8H(3, 1) // V3.4S ← V1.8H (upper 4x16-bit → 4x32-bit) + USHLL_2D_2S(4, 2) // V4.2D ← V2.2S (lower 2x32-bit → 2x64-bit) + USHLL2_2D_4S(5, 2) // V5.2D ← V2.4S (upper 2x32-bit → 2x64-bit) + USHLL_2D_2S(6, 3) // V6.2D ← V3.2S (lower 2x32-bit → 2x64-bit) + USHLL2_2D_4S(7, 3) // V7.2D ← V3.4S (upper 2x32-bit → 2x64-bit) + + // Store 8 int64 values (64 bytes) + VST1 [V4.D2, V5.D2], (R0) + ADD $32, R0, R11 // Temporary pointer for second store + VST1 [V6.D2, V7.D2], (R11) + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes + ADD $64, R0, R0 // dst += 8 int64 (64 bytes) + ADD $8, R5, R5 // index += 8 + + CMP R4, R5 + BLT tbl8_loop + +tbl8_tail: + // Handle remaining elements (0-7) one by one + CMP R1, R5 + BGE tbl8_done + +tbl8_tail_loop: + MOVBU (R2), R6 // Load byte + MOVD R6, (R0) // Store as int64 (zero-extended) + + ADD $1, R2, R2 // src++ + ADD $8, R0, R0 // dst++ + ADD $1, R5, R5 // index++ + + CMP R1, R5 + BLT tbl8_tail_loop + +tbl8_done: + RET diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_amd64.go b/vendor/github.com/parquet-go/bitpack/unpack_int64_amd64.go similarity index 90% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_amd64.go rename to vendor/github.com/parquet-go/bitpack/unpack_int64_amd64.go index 9314e73c2e9..5c53a31adfa 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_amd64.go +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_amd64.go @@ -3,7 +3,7 @@ package bitpack import ( - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" "golang.org/x/sys/cpu" ) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_amd64.s b/vendor/github.com/parquet-go/bitpack/unpack_int64_amd64.s similarity index 100% rename from 
vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_amd64.s rename to vendor/github.com/parquet-go/bitpack/unpack_int64_amd64.s diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int64_arm64.go b/vendor/github.com/parquet-go/bitpack/unpack_int64_arm64.go new file mode 100644 index 00000000000..02e6afc9943 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_arm64.go @@ -0,0 +1,50 @@ +//go:build !purego + +package bitpack + +import ( + "github.com/parquet-go/bitpack/unsafecast" +) + +//go:noescape +func unpackInt64Default(dst []int64, src []byte, bitWidth uint) + +//go:noescape +func unpackInt64x1to32bitsARM64(dst []int64, src []byte, bitWidth uint) + +//go:noescape +func unpackInt64x1bitNEON(dst []int64, src []byte, bitWidth uint) + +//go:noescape +func unpackInt64x2bitNEON(dst []int64, src []byte, bitWidth uint) + +//go:noescape +func unpackInt64x3bitNEON(dst []int64, src []byte, bitWidth uint) + +//go:noescape +func unpackInt64x4bitNEON(dst []int64, src []byte, bitWidth uint) + +//go:noescape +func unpackInt64x8bitNEON(dst []int64, src []byte, bitWidth uint) + +func unpackInt64(dst []int64, src []byte, bitWidth uint) { + // For ARM64, NEON (Advanced SIMD) is always available + // Use table-based NEON operations for small bit widths + switch { + case bitWidth == 1: + unpackInt64x1bitNEON(dst, src, bitWidth) + case bitWidth == 2: + unpackInt64x2bitNEON(dst, src, bitWidth) + case bitWidth == 4: + unpackInt64x4bitNEON(dst, src, bitWidth) + case bitWidth == 8: + unpackInt64x8bitNEON(dst, src, bitWidth) + // bitWidth == 3,5,6,7: Skip NEON table (don't divide evenly into 8) + case bitWidth <= 32: + unpackInt64x1to32bitsARM64(dst, src, bitWidth) + case bitWidth == 64: + copy(dst, unsafecast.Slice[int64](src)) + default: + unpackInt64Default(dst, src, bitWidth) + } +} diff --git a/vendor/github.com/parquet-go/bitpack/unpack_int64_arm64.s b/vendor/github.com/parquet-go/bitpack/unpack_int64_arm64.s new file mode 100644 index 
00000000000..2c638fbfe6d --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_arm64.s @@ -0,0 +1,943 @@ +//go:build !purego + +#include "funcdata.h" +#include "textflag.h" + +// func unpackInt64Default(dst []int64, src []byte, bitWidth uint) +TEXT ·unpackInt64Default(SB), NOSPLIT, $0-56 + MOVD dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + // Initialize registers + MOVD $0, R5 // R5 = bitOffset + MOVD $0, R6 // R6 = index + + // Check if length >= 4 for unrolled loop + CMP $4, R1 + BLT scalar_loop_start + + // Calculate bitMask = (1 << bitWidth) - 1 + MOVD $1, R4 + LSL R3, R4, R4 + SUB $1, R4, R4 // R4 = bitMask + + // Calculate unrolled iterations: (length / 4) * 4 + LSR $2, R1, R16 // R16 = length / 4 + CBZ R16, scalar_loop_start + LSL $2, R16, R16 // R16 = (length / 4) * 4 + +unrolled_loop: + // Process 4 elements with instruction-level parallelism + // Use 64-bit loads for better performance + + // === Element 0 === + LSR $6, R5, R7 // i = bitOffset / 64 + AND $63, R5, R8 // j = bitOffset % 64 + MOVD (R2)(R7<<3), R9 // load 64-bit word from src[i] + LSL R8, R4, R10 + AND R10, R9, R9 + LSR R8, R9, R9 + + // Check if value spans into next word + ADD R8, R3, R11 + CMP $64, R11 + BLE store0 + MOVD $64, R12 + SUB R8, R12, R12 + ADD $1, R7, R13 + MOVD (R2)(R13<<3), R14 + LSR R12, R4, R15 + AND R15, R14, R14 + LSL R12, R14, R14 + ORR R14, R9, R9 + +store0: + ADD R3, R5, R5 // bitOffset += bitWidth + MOVD R9, (R0)(R6<<3) + ADD $1, R6, R6 + + // === Element 1 === + LSR $6, R5, R7 + AND $63, R5, R8 + MOVD (R2)(R7<<3), R9 + LSL R8, R4, R10 + AND R10, R9, R9 + LSR R8, R9, R9 + + ADD R8, R3, R11 + CMP $64, R11 + BLE store1 + MOVD $64, R12 + SUB R8, R12, R12 + ADD $1, R7, R13 + MOVD (R2)(R13<<3), R14 + LSR R12, R4, R15 + AND R15, R14, R14 + LSL R12, R14, R14 + ORR R14, R9, R9 + +store1: + ADD R3, R5, R5 + MOVD R9, (R0)(R6<<3) + 
ADD $1, R6, R6 + + // === Element 2 === + LSR $6, R5, R7 + AND $63, R5, R8 + MOVD (R2)(R7<<3), R9 + LSL R8, R4, R10 + AND R10, R9, R9 + LSR R8, R9, R9 + + ADD R8, R3, R11 + CMP $64, R11 + BLE store2 + MOVD $64, R12 + SUB R8, R12, R12 + ADD $1, R7, R13 + MOVD (R2)(R13<<3), R14 + LSR R12, R4, R15 + AND R15, R14, R14 + LSL R12, R14, R14 + ORR R14, R9, R9 + +store2: + ADD R3, R5, R5 + MOVD R9, (R0)(R6<<3) + ADD $1, R6, R6 + + // === Element 3 === + LSR $6, R5, R7 + AND $63, R5, R8 + MOVD (R2)(R7<<3), R9 + LSL R8, R4, R10 + AND R10, R9, R9 + LSR R8, R9, R9 + + ADD R8, R3, R11 + CMP $64, R11 + BLE store3 + MOVD $64, R12 + SUB R8, R12, R12 + ADD $1, R7, R13 + MOVD (R2)(R13<<3), R14 + LSR R12, R4, R15 + AND R15, R14, R14 + LSL R12, R14, R14 + ORR R14, R9, R9 + +store3: + ADD R3, R5, R5 + MOVD R9, (R0)(R6<<3) + ADD $1, R6, R6 + + CMP R16, R6 + BLT unrolled_loop + + // Check if done + CMP R1, R6 + BEQ done + +scalar_loop_start: + // Fallback scalar loop for remaining elements + MOVD $1, R4 + LSL R3, R4, R4 + SUB $1, R4, R4 // R4 = bitMask + +scalar_loop: + LSR $6, R5, R7 // i = bitOffset / 64 + AND $63, R5, R8 // j = bitOffset % 64 + MOVD (R2)(R7<<3), R9 // load 64-bit word + LSL R8, R4, R10 // bitMask << j + AND R10, R9, R9 + LSR R8, R9, R9 // extracted value + + // Check for span + ADD R8, R3, R11 + CMP $64, R11 + BLE scalar_next + MOVD $64, R12 + SUB R8, R12, R12 // k = 64 - j + ADD $1, R7, R13 + MOVD (R2)(R13<<3), R14 + LSR R12, R4, R15 + AND R15, R14, R14 + LSL R12, R14, R14 + ORR R14, R9, R9 + +scalar_next: + MOVD R9, (R0)(R6<<3) // dst[index] = d + ADD R3, R5, R5 // bitOffset += bitWidth + ADD $1, R6, R6 // index++ + +scalar_test: + CMP R1, R6 + BNE scalar_loop + +done: + RET + +// unpackInt64x1to32bitsARM64 implements optimized unpacking for bit widths 1-32 +// Uses optimized scalar ARM64 operations with batched processing +// +// func unpackInt64x1to32bitsARM64(dst []int64, src []byte, bitWidth uint) +TEXT ·unpackInt64x1to32bitsARM64(SB), NOSPLIT, $0-56 + MOVD 
dst_base+0(FP), R0 // R0 = dst pointer + MOVD dst_len+8(FP), R1 // R1 = dst length + MOVD src_base+24(FP), R2 // R2 = src pointer + MOVD bitWidth+48(FP), R3 // R3 = bitWidth + + // Check if we have at least 4 values to process + CMP $4, R1 + BLT scalar_fallback_int64 + + // Determine which path to use based on bitWidth + CMP $1, R3 + BEQ int64_1bit + CMP $2, R3 + BEQ int64_2bit + CMP $3, R3 + BEQ int64_3bit + CMP $4, R3 + BEQ int64_4bit + CMP $5, R3 + BEQ int64_5bit + CMP $6, R3 + BEQ int64_6bit + CMP $7, R3 + BEQ int64_7bit + CMP $8, R3 + BEQ int64_8bit + CMP $16, R3 + BEQ int64_16bit + CMP $32, R3 + BEQ int64_32bit + + // For other bit widths, fall back to scalar + B scalar_fallback_int64 + +int64_1bit: + // BitWidth 1: 8 int64 values packed in 1 byte + // Process 8 values at a time + + // Round down to multiple of 8 for processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_1bit_loop: + // Load 1 byte (contains 8 values, 1 bit each) + MOVBU (R2), R6 + + // Extract 8 bits + AND $1, R6, R7 + MOVD R7, (R0) + LSR $1, R6, R7 + AND $1, R7, R7 + MOVD R7, 8(R0) + LSR $2, R6, R7 + AND $1, R7, R7 + MOVD R7, 16(R0) + LSR $3, R6, R7 + AND $1, R7, R7 + MOVD R7, 24(R0) + LSR $4, R6, R7 + AND $1, R7, R7 + MOVD R7, 32(R0) + LSR $5, R6, R7 + AND $1, R7, R7 + MOVD R7, 40(R0) + LSR $6, R6, R7 + AND $1, R7, R7 + MOVD R7, 48(R0) + LSR $7, R6, R7 + AND $1, R7, R7 + MOVD R7, 56(R0) + + ADD $1, R2, R2 + ADD $64, R0, R0 + ADD $8, R5, R5 + + CMP R4, R5 + BLT int64_1bit_loop + + CMP R1, R5 + BEQ int64_done + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_2bit: + // BitWidth 2: 8 int64 values packed in 2 bytes + MOVD R1, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_2bit_loop: + MOVHU (R2), R6 + + AND $3, R6, R7 + MOVD R7, (R0) + LSR $2, R6, R7 + AND $3, R7, R7 + MOVD R7, 8(R0) + LSR $4, R6, R7 + AND 
$3, R7, R7 + MOVD R7, 16(R0) + LSR $6, R6, R7 + AND $3, R7, R7 + MOVD R7, 24(R0) + LSR $8, R6, R7 + AND $3, R7, R7 + MOVD R7, 32(R0) + LSR $10, R6, R7 + AND $3, R7, R7 + MOVD R7, 40(R0) + LSR $12, R6, R7 + AND $3, R7, R7 + MOVD R7, 48(R0) + LSR $14, R6, R7 + AND $3, R7, R7 + MOVD R7, 56(R0) + + ADD $2, R2, R2 + ADD $64, R0, R0 + ADD $8, R5, R5 + + CMP R4, R5 + BLT int64_2bit_loop + + CMP R1, R5 + BEQ int64_done + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_3bit: + // BitWidth 3: 8 int64 values packed in 3 bytes + MOVD R1, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_3bit_loop: + MOVWU (R2), R6 + + AND $7, R6, R7 + MOVD R7, (R0) + LSR $3, R6, R7 + AND $7, R7, R7 + MOVD R7, 8(R0) + LSR $6, R6, R7 + AND $7, R7, R7 + MOVD R7, 16(R0) + LSR $9, R6, R7 + AND $7, R7, R7 + MOVD R7, 24(R0) + LSR $12, R6, R7 + AND $7, R7, R7 + MOVD R7, 32(R0) + LSR $15, R6, R7 + AND $7, R7, R7 + MOVD R7, 40(R0) + LSR $18, R6, R7 + AND $7, R7, R7 + MOVD R7, 48(R0) + LSR $21, R6, R7 + AND $7, R7, R7 + MOVD R7, 56(R0) + + ADD $3, R2, R2 + ADD $64, R0, R0 + ADD $8, R5, R5 + + CMP R4, R5 + BLT int64_3bit_loop + + CMP R1, R5 + BEQ int64_done + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_4bit: + // BitWidth 4: 8 int64 values packed in 4 bytes + MOVD R1, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_4bit_loop: + MOVWU (R2), R6 + + AND $15, R6, R7 + MOVD R7, (R0) + LSR $4, R6, R7 + AND $15, R7, R7 + MOVD R7, 8(R0) + LSR $8, R6, R7 + AND $15, R7, R7 + MOVD R7, 16(R0) + LSR $12, R6, R7 + AND $15, R7, R7 + MOVD R7, 24(R0) + LSR $16, R6, R7 + AND $15, R7, R7 + MOVD R7, 32(R0) + LSR $20, R6, R7 + AND $15, R7, R7 + MOVD R7, 40(R0) + LSR $24, R6, R7 + AND $15, R7, R7 + MOVD R7, 48(R0) + LSR $28, R6, R7 + AND $15, R7, R7 + MOVD R7, 56(R0) + + ADD $4, R2, R2 + ADD $64, R0, R0 + ADD $8, R5, R5 + + CMP R4, R5 + BLT int64_4bit_loop + + CMP R1, R5 + BEQ int64_done + SUB R5, R1, 
R1 + B scalar_fallback_entry_int64 + +int64_5bit: + // BitWidth 5: 8 int64 values packed in 5 bytes + MOVD R1, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_5bit_loop: + MOVD (R2), R6 + + AND $31, R6, R7 + MOVD R7, (R0) + LSR $5, R6, R7 + AND $31, R7, R7 + MOVD R7, 8(R0) + LSR $10, R6, R7 + AND $31, R7, R7 + MOVD R7, 16(R0) + LSR $15, R6, R7 + AND $31, R7, R7 + MOVD R7, 24(R0) + LSR $20, R6, R7 + AND $31, R7, R7 + MOVD R7, 32(R0) + LSR $25, R6, R7 + AND $31, R7, R7 + MOVD R7, 40(R0) + LSR $30, R6, R7 + AND $31, R7, R7 + MOVD R7, 48(R0) + LSR $35, R6, R7 + AND $31, R7, R7 + MOVD R7, 56(R0) + + ADD $5, R2, R2 + ADD $64, R0, R0 + ADD $8, R5, R5 + + CMP R4, R5 + BLT int64_5bit_loop + + CMP R1, R5 + BEQ int64_done + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_6bit: + // BitWidth 6: 8 int64 values packed in 6 bytes + MOVD R1, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_6bit_loop: + MOVD (R2), R6 + + AND $63, R6, R7 + MOVD R7, (R0) + LSR $6, R6, R7 + AND $63, R7, R7 + MOVD R7, 8(R0) + LSR $12, R6, R7 + AND $63, R7, R7 + MOVD R7, 16(R0) + LSR $18, R6, R7 + AND $63, R7, R7 + MOVD R7, 24(R0) + LSR $24, R6, R7 + AND $63, R7, R7 + MOVD R7, 32(R0) + LSR $30, R6, R7 + AND $63, R7, R7 + MOVD R7, 40(R0) + LSR $36, R6, R7 + AND $63, R7, R7 + MOVD R7, 48(R0) + LSR $42, R6, R7 + AND $63, R7, R7 + MOVD R7, 56(R0) + + ADD $6, R2, R2 + ADD $64, R0, R0 + ADD $8, R5, R5 + + CMP R4, R5 + BLT int64_6bit_loop + + CMP R1, R5 + BEQ int64_done + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_7bit: + // BitWidth 7: 8 int64 values packed in 7 bytes + MOVD R1, R4 + LSR $3, R4, R4 + LSL $3, R4, R4 + + MOVD $0, R5 + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_7bit_loop: + MOVD (R2), R6 + + AND $127, R6, R7 + MOVD R7, (R0) + LSR $7, R6, R7 + AND $127, R7, R7 + MOVD R7, 8(R0) + LSR $14, R6, R7 + AND $127, R7, R7 + MOVD R7, 16(R0) + LSR $21, R6, R7 + AND $127, R7, 
R7 + MOVD R7, 24(R0) + LSR $28, R6, R7 + AND $127, R7, R7 + MOVD R7, 32(R0) + LSR $35, R6, R7 + AND $127, R7, R7 + MOVD R7, 40(R0) + LSR $42, R6, R7 + AND $127, R7, R7 + MOVD R7, 48(R0) + LSR $49, R6, R7 + AND $127, R7, R7 + MOVD R7, 56(R0) + + ADD $7, R2, R2 + ADD $64, R0, R0 + ADD $8, R5, R5 + + CMP R4, R5 + BLT int64_7bit_loop + + CMP R1, R5 + BEQ int64_done + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_8bit: + // BitWidth 8: 8 int64 values packed in 8 bytes + // Process 8 values at a time + + // Round down to multiple of 8 for processing + MOVD R1, R4 + LSR $3, R4, R4 // R4 = len / 8 + LSL $3, R4, R4 // R4 = aligned length (multiple of 8) + + MOVD $0, R5 // R5 = index + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_8bit_loop: + // Load 8 bytes (contains 8 values, 1 byte each) + MOVD (R2), R6 + + // Extract 8 bytes and store as int64 + // Value 0: byte 0 + AND $0xFF, R6, R7 + MOVD R7, (R0) + + // Value 1: byte 1 + LSR $8, R6, R7 + AND $0xFF, R7, R7 + MOVD R7, 8(R0) + + // Value 2: byte 2 + LSR $16, R6, R7 + AND $0xFF, R7, R7 + MOVD R7, 16(R0) + + // Value 3: byte 3 + LSR $24, R6, R7 + AND $0xFF, R7, R7 + MOVD R7, 24(R0) + + // Value 4: byte 4 + LSR $32, R6, R7 + AND $0xFF, R7, R7 + MOVD R7, 32(R0) + + // Value 5: byte 5 + LSR $40, R6, R7 + AND $0xFF, R7, R7 + MOVD R7, 40(R0) + + // Value 6: byte 6 + LSR $48, R6, R7 + AND $0xFF, R7, R7 + MOVD R7, 48(R0) + + // Value 7: byte 7 + LSR $56, R6, R7 + MOVD R7, 56(R0) + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes (8 values) + ADD $64, R0, R0 // dst += 8 int64 (64 bytes) + ADD $8, R5, R5 // index += 8 + + CMP R4, R5 + BLT int64_8bit_loop + + // Handle tail with scalar + CMP R1, R5 + BEQ int64_done + + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_16bit: + // BitWidth 16: 4 int64 values packed in 8 bytes + // Process 4 values at a time + + MOVD R1, R4 + LSR $2, R4, R4 // R4 = len / 4 + LSL $2, R4, R4 // R4 = aligned length (multiple of 4) + + MOVD $0, R5 // R5 = index + CMP $0, R4 + 
BEQ scalar_fallback_int64 + +int64_16bit_loop: + // Load 8 bytes as 4 uint16 values + MOVD (R2), R6 + + // Extract 16-bit values and write as int64 + // Value 0 (bits 0-15) + AND $0xFFFF, R6, R7 + MOVD R7, (R0) + + // Value 1 (bits 16-31) + LSR $16, R6, R7 + AND $0xFFFF, R7, R7 + MOVD R7, 8(R0) + + // Value 2 (bits 32-47) + LSR $32, R6, R7 + AND $0xFFFF, R7, R7 + MOVD R7, 16(R0) + + // Value 3 (bits 48-63) + LSR $48, R6, R7 + MOVD R7, 24(R0) + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes (4 values) + ADD $32, R0, R0 // dst += 4 int64 (32 bytes) + ADD $4, R5, R5 // index += 4 + + CMP R4, R5 + BLT int64_16bit_loop + + // Handle tail with scalar + CMP R1, R5 + BEQ int64_done + + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_32bit: + // BitWidth 32: 2 int64 values packed in 8 bytes + // Process 2 values at a time + + MOVD R1, R4 + LSR $1, R4, R4 // R4 = len / 2 + LSL $1, R4, R4 // R4 = aligned length (multiple of 2) + + MOVD $0, R5 // R5 = index + CMP $0, R4 + BEQ scalar_fallback_int64 + +int64_32bit_loop: + // Load 8 bytes as 2 uint32 values + MOVD (R2), R6 + + // Extract 32-bit values and write as int64 + // Value 0 (bits 0-31) + AND $0xFFFFFFFF, R6, R7 + MOVD R7, (R0) + + // Value 1 (bits 32-63) + LSR $32, R6, R7 + MOVD R7, 8(R0) + + // Advance pointers + ADD $8, R2, R2 // src += 8 bytes (2 values) + ADD $16, R0, R0 // dst += 2 int64 (16 bytes) + ADD $2, R5, R5 // index += 2 + + CMP R4, R5 + BLT int64_32bit_loop + + // Handle tail with scalar + CMP R1, R5 + BEQ int64_done + + SUB R5, R1, R1 + B scalar_fallback_entry_int64 + +int64_done: + RET + +scalar_fallback_int64: + MOVD $0, R5 // Start from beginning + +scalar_fallback_entry_int64: + // R0 = current dst position (already advanced) + // R1 = remaining elements + // R2 = current src position (already advanced) + // R3 = bitWidth + // R5 = elements already processed + + // Fall back to optimized implementation for remaining elements + CMP $0, R1 + BEQ scalar_done_int64 // No remaining 
elements + + // Check if we can do 4-way unrolled loop + CMP $4, R1 + BLT scalar_single_int64 + + // Calculate bitMask + MOVD $1, R4 + LSL R3, R4, R4 + SUB $1, R4, R4 // R4 = bitMask + + // Calculate unrolled iterations: (remaining / 4) * 4 + LSR $2, R1, R16 + CBZ R16, scalar_single_int64 + LSL $2, R16, R16 // R16 = (len / 4) * 4 + + MOVD $0, R6 // R6 = bitOffset + MOVD $0, R7 // R7 = index + +scalar_unrolled_loop_int64: + // === Element 0 === + LSR $6, R6, R8 // i = bitOffset / 64 + AND $63, R6, R9 // j = bitOffset % 64 + MOVD (R2)(R8<<3), R11 // load 64-bit word + LSL R9, R4, R12 + AND R12, R11, R11 + LSR R9, R11, R11 + + ADD R9, R3, R12 + CMP $64, R12 + BLE scalar_store0_int64 + MOVD $64, R13 + SUB R9, R13, R13 + ADD $1, R8, R14 + MOVD (R2)(R14<<3), R15 + LSR R13, R4, R10 + AND R10, R15, R15 + LSL R13, R15, R15 + ORR R15, R11, R11 + +scalar_store0_int64: + ADD R3, R6, R6 + LSL $3, R7, R10 + MOVD R11, (R0)(R10) + ADD $1, R7, R7 + + // === Element 1 === + LSR $6, R6, R8 + AND $63, R6, R9 + MOVD (R2)(R8<<3), R11 + LSL R9, R4, R12 + AND R12, R11, R11 + LSR R9, R11, R11 + + ADD R9, R3, R12 + CMP $64, R12 + BLE scalar_store1_int64 + MOVD $64, R13 + SUB R9, R13, R13 + ADD $1, R8, R14 + MOVD (R2)(R14<<3), R15 + LSR R13, R4, R10 + AND R10, R15, R15 + LSL R13, R15, R15 + ORR R15, R11, R11 + +scalar_store1_int64: + ADD R3, R6, R6 + LSL $3, R7, R10 + MOVD R11, (R0)(R10) + ADD $1, R7, R7 + + // === Element 2 === + LSR $6, R6, R8 + AND $63, R6, R9 + MOVD (R2)(R8<<3), R11 + LSL R9, R4, R12 + AND R12, R11, R11 + LSR R9, R11, R11 + + ADD R9, R3, R12 + CMP $64, R12 + BLE scalar_store2_int64 + MOVD $64, R13 + SUB R9, R13, R13 + ADD $1, R8, R14 + MOVD (R2)(R14<<3), R15 + LSR R13, R4, R10 + AND R10, R15, R15 + LSL R13, R15, R15 + ORR R15, R11, R11 + +scalar_store2_int64: + ADD R3, R6, R6 + LSL $3, R7, R10 + MOVD R11, (R0)(R10) + ADD $1, R7, R7 + + // === Element 3 === + LSR $6, R6, R8 + AND $63, R6, R9 + MOVD (R2)(R8<<3), R11 + LSL R9, R4, R12 + AND R12, R11, R11 + LSR R9, R11, R11 
+ + ADD R9, R3, R12 + CMP $64, R12 + BLE scalar_store3_int64 + MOVD $64, R13 + SUB R9, R13, R13 + ADD $1, R8, R14 + MOVD (R2)(R14<<3), R15 + LSR R13, R4, R10 + AND R10, R15, R15 + LSL R13, R15, R15 + ORR R15, R11, R11 + +scalar_store3_int64: + ADD R3, R6, R6 + LSL $3, R7, R10 + MOVD R11, (R0)(R10) + ADD $1, R7, R7 + + CMP R16, R7 + BLT scalar_unrolled_loop_int64 + + // Check if done + CMP R1, R7 + BEQ scalar_done_int64 + + // Preserve R6 (bitOffset), R4 (bitMask), and R7 (index) for tail processing + // R1 still contains total count, R7 has current index + B scalar_loop_single_int64 + +scalar_single_int64: + // Process remaining elements one at a time + MOVD $1, R4 + LSL R3, R4, R4 + SUB $1, R4, R4 // R4 = bitMask + + MOVD $0, R6 // R6 = bitOffset + MOVD $0, R7 // R7 = index + +scalar_loop_single_int64: + LSR $6, R6, R8 // i = bitOffset / 64 + AND $63, R6, R9 // j = bitOffset % 64 + MOVD (R2)(R8<<3), R11 // load 64-bit word + LSL R9, R4, R12 + AND R12, R11, R11 + LSR R9, R11, R11 + + ADD R9, R3, R12 + CMP $64, R12 + BLE scalar_next_single_int64 + MOVD $64, R13 + SUB R9, R13, R13 + ADD $1, R8, R14 + MOVD (R2)(R14<<3), R15 + LSR R13, R4, R10 + AND R10, R15, R15 + LSL R13, R15, R15 + ORR R15, R11, R11 + +scalar_next_single_int64: + LSL $3, R7, R10 + MOVD R11, (R0)(R10) + ADD R3, R6, R6 + ADD $1, R7, R7 + + CMP R1, R7 + BLT scalar_loop_single_int64 + +scalar_done_int64: + RET + +// Macro definitions for unsupported NEON instructions using WORD encodings +// USHLL Vd.8H, Vn.8B, #0 - widen 8x8-bit to 8x16-bit +#define USHLL_8H_8B(vd, vn) WORD $(0x2f08a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.8H, Vn.16B, #0 - widen upper 8x8-bit to 8x16-bit +#define USHLL2_8H_16B(vd, vn) WORD $(0x6f08a400 | (vd) | ((vn)<<5)) + +// USHLL Vd.4S, Vn.4H, #0 - widen 4x16-bit to 4x32-bit +#define USHLL_4S_4H(vd, vn) WORD $(0x2f10a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.4S, Vn.8H, #0 - widen upper 4x16-bit to 4x32-bit +#define USHLL2_4S_8H(vd, vn) WORD $(0x6f10a400 | (vd) | ((vn)<<5)) + +// USHLL 
Vd.2D, Vn.2S, #0 - widen 2x32-bit to 2x64-bit +#define USHLL_2D_2S(vd, vn) WORD $(0x2f20a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.2D, Vn.4S, #0 - widen upper 2x32-bit to 2x64-bit +#define USHLL2_2D_4S(vd, vn) WORD $(0x6f20a400 | (vd) | ((vn)<<5)) + +// unpackInt64x1bitNEON implements table-based NEON unpacking for int64 bitWidth=1 +// Similar to int32 version but with additional widening to 64-bit +// +// func unpackInt64x1bitNEON(dst []int64, src []byte, bitWidth uint) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_purego.go b/vendor/github.com/parquet-go/bitpack/unpack_int64_purego.go similarity index 92% rename from vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_purego.go rename to vendor/github.com/parquet-go/bitpack/unpack_int64_purego.go index 7a20882efbc..3d7b1183282 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack_int64_purego.go +++ b/vendor/github.com/parquet-go/bitpack/unpack_int64_purego.go @@ -1,4 +1,4 @@ -//go:build purego || !amd64 +//go:build purego || (!amd64 && !arm64) package bitpack diff --git a/vendor/github.com/parquet-go/bitpack/unpack_neon_macros_arm64.h b/vendor/github.com/parquet-go/bitpack/unpack_neon_macros_arm64.h new file mode 100644 index 00000000000..7ba1c8f7b01 --- /dev/null +++ b/vendor/github.com/parquet-go/bitpack/unpack_neon_macros_arm64.h @@ -0,0 +1,18 @@ +// Macro definitions for unsupported NEON instructions using WORD encodings +// USHLL Vd.8H, Vn.8B, #0 - widen 8x8-bit to 8x16-bit +#define USHLL_8H_8B(vd, vn) WORD $(0x2f08a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.8H, Vn.16B, #0 - widen upper 8x8-bit to 8x16-bit +#define USHLL2_8H_16B(vd, vn) WORD $(0x6f08a400 | (vd) | ((vn)<<5)) + +// USHLL Vd.4S, Vn.4H, #0 - widen 4x16-bit to 4x32-bit +#define USHLL_4S_4H(vd, vn) WORD $(0x2f10a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.4S, Vn.8H, #0 - widen upper 4x16-bit to 4x32-bit +#define USHLL2_4S_8H(vd, vn) WORD $(0x6f10a400 | (vd) | ((vn)<<5)) + +// 
USHLL Vd.2D, Vn.2S, #0 - widen 2x32-bit to 2x64-bit +#define USHLL_2D_2S(vd, vn) WORD $(0x2f20a400 | (vd) | ((vn)<<5)) + +// USHLL2 Vd.2D, Vn.4S, #0 - widen upper 2x32-bit to 2x64-bit +#define USHLL2_2D_4S(vd, vn) WORD $(0x6f20a400 | (vd) | ((vn)<<5)) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/unsafecast/unsafecast.go b/vendor/github.com/parquet-go/bitpack/unsafecast/unsafecast.go similarity index 100% rename from vendor/github.com/parquet-go/parquet-go/internal/unsafecast/unsafecast.go rename to vendor/github.com/parquet-go/bitpack/unsafecast/unsafecast.go diff --git a/vendor/github.com/parquet-go/parquet-go/allocator.go b/vendor/github.com/parquet-go/parquet-go/allocator.go index d1500643b3c..0c1c3ae261b 100644 --- a/vendor/github.com/parquet-go/parquet-go/allocator.go +++ b/vendor/github.com/parquet-go/parquet-go/allocator.go @@ -3,7 +3,7 @@ package parquet import ( "unsafe" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" ) type allocator struct{ buffer []byte } diff --git a/vendor/github.com/parquet-go/parquet-go/bloom.go b/vendor/github.com/parquet-go/parquet-go/bloom.go index 911de082122..550cf21192f 100644 --- a/vendor/github.com/parquet-go/parquet-go/bloom.go +++ b/vendor/github.com/parquet-go/parquet-go/bloom.go @@ -3,12 +3,12 @@ package parquet import ( "io" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/bloom" "github.com/parquet-go/parquet-go/bloom/xxhash" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) // BloomFilter is an interface allowing applications to test whether a key diff --git a/vendor/github.com/parquet-go/parquet-go/bloom/filter.go b/vendor/github.com/parquet-go/parquet-go/bloom/filter.go index 11cc255a1c1..32c7630d18f 100644 --- a/vendor/github.com/parquet-go/parquet-go/bloom/filter.go +++ 
b/vendor/github.com/parquet-go/parquet-go/bloom/filter.go @@ -4,7 +4,7 @@ import ( "io" "sync" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" ) // Filter is an interface representing read-only bloom filters where programs diff --git a/vendor/github.com/parquet-go/parquet-go/bloom_le.go b/vendor/github.com/parquet-go/parquet-go/bloom_le.go index 5b93bf07177..6f8bbdcd846 100644 --- a/vendor/github.com/parquet-go/parquet-go/bloom_le.go +++ b/vendor/github.com/parquet-go/parquet-go/bloom_le.go @@ -3,8 +3,8 @@ package parquet import ( + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) func unsafecastInt96ToBytes(src []deprecated.Int96) []byte { diff --git a/vendor/github.com/parquet-go/parquet-go/buffer.go b/vendor/github.com/parquet-go/parquet-go/buffer.go index 637c0477ecc..2fc8ffc32e9 100644 --- a/vendor/github.com/parquet-go/parquet-go/buffer.go +++ b/vendor/github.com/parquet-go/parquet-go/buffer.go @@ -82,7 +82,7 @@ func bufferFuncOf[T any](t reflect.Type, schema *Schema) bufferFunc[T] { } func makeBufferFunc[T any](t reflect.Type, schema *Schema) bufferFunc[T] { - writeRows := writeRowsFuncOf(t, schema, nil) + writeRows := writeRowsFuncOf(t, schema, nil, nil) return func(buf *GenericBuffer[T], rows []T) (n int, err error) { err = writeRows(buf.base.columns, makeArrayOf(rows), columnLevels{}) if err == nil { @@ -264,9 +264,19 @@ func (buf *Buffer) configure(schema *Schema) { column := columnType.NewColumnBuffer(columnIndex, bufferCap) switch { case leaf.maxRepetitionLevel > 0: - column = newRepeatedColumnBuffer(column, leaf.maxRepetitionLevel, leaf.maxDefinitionLevel, nullOrdering) + // The Buffer implementation does not have a Close method, so we should do direct + // allocation to avoid breaking existing users. 
+ n := column.Cap() + repetitionLevels := make([]byte, 0, n) + definitionLevels := make([]byte, 0, n) + column = newRepeatedColumnBuffer(column, repetitionLevels, definitionLevels, leaf.maxRepetitionLevel, leaf.maxDefinitionLevel, nullOrdering) case leaf.maxDefinitionLevel > 0: - column = newOptionalColumnBuffer(column, leaf.maxDefinitionLevel, nullOrdering) + // The Buffer implementation does not have a Close method, so we should do direct + // allocation to avoid breaking existing users. + n := column.Cap() + rows := make([]int32, 0, n) + definitionLevels := make([]byte, 0, n) + column = newOptionalColumnBuffer(column, rows, definitionLevels, leaf.maxDefinitionLevel, nullOrdering) } buf.columns = append(buf.columns, column) @@ -467,36 +477,44 @@ var ( _ ValueWriter = (*bufferWriter)(nil) ) -type buffer struct { - data []byte - refc uintptr - pool *bufferPool +type bufferedType interface{ byte | int32 | uint32 } + +type buffer[T bufferedType] struct { + data []T + refc atomic.Int32 + pool *bufferPool[T] stack []byte + id uint64 } -func (b *buffer) refCount() int { - return int(atomic.LoadUintptr(&b.refc)) +func newBuffer[T bufferedType](data []T) *buffer[T] { + b := &buffer[T]{data: data} + b.refc.Store(1) + return b } -func (b *buffer) ref() { - atomic.AddUintptr(&b.refc, +1) +func (b *buffer[T]) ref() { + if b.refc.Add(1) <= 1 { + panic("BUG: buffer reference count overflow") + } } -func (b *buffer) unref() { - if atomic.AddUintptr(&b.refc, ^uintptr(0)) == 0 { - if b.pool != nil { - b.pool.put(b) - } +func (b *buffer[T]) unref() { + switch refc := b.refc.Add(-1); { + case refc < 0: + panic("BUG: buffer reference count underflow") + case refc == 0 && b.pool != nil: + b.pool.put(b) } } -func monitorBufferRelease(b *buffer) { - if rc := b.refCount(); rc != 0 { - log.Printf("PARQUETGODEBUG: buffer garbage collected with non-zero reference count\n%s", string(b.stack)) +func monitorBufferRelease[T bufferedType](b *buffer[T]) { + if rc := b.refc.Load(); rc != 0 { + 
log.Printf("PARQUETGODEBUG: buffer[%d] garbage collected with non-zero reference count (rc=%d)\n%s", b.id, rc, string(b.stack)) } } -type bufferPool struct { +type bufferPool[T bufferedType] struct { // Buckets are split in two groups for short and large buffers. In the short // buffer group (below 256KB), the growth rate between each bucket is 2. The // growth rate changes to 1.5 in the larger buffer group. @@ -512,47 +530,46 @@ type bufferPool struct { buckets [bufferPoolBucketCount]sync.Pool } -func (p *bufferPool) newBuffer(bufferSize, bucketSize int) *buffer { - b := &buffer{ - data: make([]byte, bufferSize, bucketSize), - refc: 1, +func (p *bufferPool[T]) newBuffer(bufferSize, bucketSize int) *buffer[T] { + b := &buffer[T]{ + data: make([]T, bufferSize, bucketSize), pool: p, } if debug.TRACEBUF > 0 { b.stack = make([]byte, 4096) - runtime.SetFinalizer(b, monitorBufferRelease) + runtime.SetFinalizer(b, monitorBufferRelease[T]) } return b } // get returns a buffer from the levelled buffer pool. size is used to choose // the appropriate pool. 
-func (p *bufferPool) get(bufferSize int) *buffer { +func (p *bufferPool[T]) get(bufferSize int) *buffer[T] { bucketIndex, bucketSize := bufferPoolBucketIndexAndSizeOfGet(bufferSize) - b := (*buffer)(nil) + var b *buffer[T] = nil if bucketIndex >= 0 { - b, _ = p.buckets[bucketIndex].Get().(*buffer) + b, _ = p.buckets[bucketIndex].Get().(*buffer[T]) } if b == nil { b = p.newBuffer(bufferSize, bucketSize) } else { b.data = b.data[:bufferSize] - b.ref() } if debug.TRACEBUF > 0 { b.stack = b.stack[:runtime.Stack(b.stack[:cap(b.stack)], false)] } + b.refc.Store(1) return b } -func (p *bufferPool) put(b *buffer) { +func (p *bufferPool[T]) put(b *buffer[T]) { if b.pool != p { panic("BUG: buffer returned to a different pool than the one it was allocated from") } - if b.refCount() != 0 { + if b.refc.Load() != 0 { panic("BUG: buffer returned to pool with a non-zero reference count") } if bucketIndex, _ := bufferPoolBucketIndexAndSizeOfPut(cap(b.data)); bucketIndex >= 0 { @@ -605,27 +622,29 @@ func bufferPoolBucketIndexAndSizeOfPut(size int) (int, int) { } var ( - buffers bufferPool + buffers bufferPool[byte] + indexes bufferPool[int32] + offsets bufferPool[uint32] ) type bufferedPage struct { Page - values *buffer - offsets *buffer - repetitionLevels *buffer - definitionLevels *buffer + offsets *buffer[uint32] + values *buffer[byte] + repetitionLevels *buffer[byte] + definitionLevels *buffer[byte] } -func newBufferedPage(page Page, values, offsets, definitionLevels, repetitionLevels *buffer) *bufferedPage { +func newBufferedPage(page Page, offsets *buffer[uint32], values *buffer[byte], definitionLevels, repetitionLevels *buffer[byte]) *bufferedPage { p := &bufferedPage{ Page: page, - values: values, offsets: offsets, + values: values, definitionLevels: definitionLevels, repetitionLevels: repetitionLevels, } - bufferRef(values) bufferRef(offsets) + bufferRef(values) bufferRef(definitionLevels) bufferRef(repetitionLevels) return p @@ -634,34 +653,36 @@ func 
newBufferedPage(page Page, values, offsets, definitionLevels, repetitionLev func (p *bufferedPage) Slice(i, j int64) Page { return newBufferedPage( p.Page.Slice(i, j), - p.values, p.offsets, + p.values, p.definitionLevels, p.repetitionLevels, ) } func (p *bufferedPage) Retain() { - bufferRef(p.values) + Retain(p.Page) bufferRef(p.offsets) + bufferRef(p.values) bufferRef(p.definitionLevels) bufferRef(p.repetitionLevels) } func (p *bufferedPage) Release() { - bufferUnref(p.values) + Release(p.Page) bufferUnref(p.offsets) + bufferUnref(p.values) bufferUnref(p.definitionLevels) bufferUnref(p.repetitionLevels) } -func bufferRef(buf *buffer) { +func bufferRef[T bufferedType](buf *buffer[T]) { if buf != nil { buf.ref() } } -func bufferUnref(buf *buffer) { +func bufferUnref[T bufferedType](buf *buffer[T]) { if buf != nil { buf.unref() } diff --git a/vendor/github.com/parquet-go/parquet-go/buffer_pool.go b/vendor/github.com/parquet-go/parquet-go/buffer_pool.go index 2dfc158be81..3dce799d3e5 100644 --- a/vendor/github.com/parquet-go/parquet-go/buffer_pool.go +++ b/vendor/github.com/parquet-go/parquet-go/buffer_pool.go @@ -326,8 +326,8 @@ func (pool *fileBufferPool) GetBuffer() io.ReadWriteSeeker { func (pool *fileBufferPool) PutBuffer(buf io.ReadWriteSeeker) { if f, _ := buf.(*os.File); f != nil { - defer f.Close() - os.Remove(f.Name()) + _ = f.Close() + _ = os.Remove(f.Name()) } } diff --git a/vendor/github.com/parquet-go/parquet-go/column.go b/vendor/github.com/parquet-go/parquet-go/column.go index 5ce6a5956b1..99743272dbd 100644 --- a/vendor/github.com/parquet-go/parquet-go/column.go +++ b/vendor/github.com/parquet-go/parquet-go/column.go @@ -5,12 +5,12 @@ import ( "fmt" "io" "reflect" + "strings" "github.com/parquet-go/parquet-go/compress" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) // Column represents a column in a parquet 
file. @@ -193,7 +193,7 @@ func (c *Column) Value(base reflect.Value) reflect.Value { func (c *Column) String() string { return c.path.String() + ": " + sprint(c.Name(), c) } func (c *Column) forEachLeaf(do func(*Column)) { - if len(c.columns) == 0 { + if isLeafSchemaElement(c.schema) { do(c) } else { for _, child := range c.columns { @@ -251,11 +251,13 @@ func (c *Column) setLevels(depth, repetition, definition, index int) (int, error c.maxDefinitionLevel = byte(definition) depth++ - if len(c.columns) > 0 { - c.index = -1 - } else { + // Only leaf columns get a column index. + if isLeafSchemaElement(c.schema) { c.index = int16(index) index++ + } else { + // Groups (including empty groups) don't get a column index + c.index = -1 } var err error @@ -283,7 +285,7 @@ func (cl *columnLoader) open(file *File, metadata *format.FileMetaData, columnIn cl.schemaIndex++ numChildren := int(c.schema.NumChildren) - if numChildren == 0 { + if isLeafSchemaElement(c.schema) { c.typ = schemaElementTypeOf(c.schema) if cl.columnOrderIndex < len(metadata.ColumnOrders) { @@ -381,6 +383,17 @@ func (cl *columnLoader) open(file *File, metadata *format.FileMetaData, columnIn return c, nil } +// isLeafSchemaElement returns true if the schema element represents a leaf node +// (a column with actual data). According to the Parquet specification, the Type +// field is set for leaf nodes and not set (nil) for group nodes. 
+// +// This is the authoritative way to distinguish between: +// - Leaf nodes: Type != nil (has column data) +// - Group nodes: Type == nil (including empty groups with NumChildren == 0) +func isLeafSchemaElement(element *format.SchemaElement) bool { + return element.Type != nil +} + func schemaElementTypeOf(s *format.SchemaElement) Type { if lt := s.LogicalType; lt != nil { // A logical type exists, the Type interface implementations in this @@ -554,7 +567,7 @@ func schemaRepetitionTypeOf(s *format.SchemaElement) format.FieldRepetitionType return format.Required } -func (c *Column) decompress(compressedPageData []byte, uncompressedPageSize int32) (page *buffer, err error) { +func (c *Column) decompress(compressedPageData []byte, uncompressedPageSize int32) (page *buffer[byte], err error) { page = buffers.get(int(uncompressedPageSize)) page.data, err = c.compression.Decode(page.data, compressedPageData) if err != nil { @@ -567,10 +580,10 @@ func (c *Column) decompress(compressedPageData []byte, uncompressedPageSize int3 // DecodeDataPageV1 decodes a data page from the header, compressed data, and // optional dictionary passed as arguments. 
func (c *Column) DecodeDataPageV1(header DataPageHeaderV1, page []byte, dict Dictionary) (Page, error) { - return c.decodeDataPageV1(header, &buffer{data: page}, dict, -1) + return c.decodeDataPageV1(header, newBuffer(page), dict, -1) } -func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *buffer, dict Dictionary, size int32) (Page, error) { +func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *buffer[byte], dict Dictionary, size int32) (Page, error) { var pageData = page.data var err error @@ -583,8 +596,8 @@ func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *buffer, dict Di } var numValues = int(header.NumValues()) - var repetitionLevels *buffer - var definitionLevels *buffer + var repetitionLevels *buffer[byte] + var definitionLevels *buffer[byte] if c.maxRepetitionLevel > 0 { encoding := lookupLevelEncoding(header.RepetitionLevelEncoding(), c.maxRepetitionLevel) @@ -614,15 +627,16 @@ func (c *Column) decodeDataPageV1(header DataPageHeaderV1, page *buffer, dict Di // DecodeDataPageV2 decodes a data page from the header, compressed data, and // optional dictionary passed as arguments. 
func (c *Column) DecodeDataPageV2(header DataPageHeaderV2, page []byte, dict Dictionary) (Page, error) { - return c.decodeDataPageV2(header, &buffer{data: page}, dict, -1) + return c.decodeDataPageV2(header, newBuffer(page), dict, -1) } -func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *buffer, dict Dictionary, size int32) (Page, error) { +func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *buffer[byte], dict Dictionary, size int32) (Page, error) { var numValues = int(header.NumValues()) var pageData = page.data var err error - var repetitionLevels *buffer - var definitionLevels *buffer + + var repetitionLevels *buffer[byte] + var definitionLevels *buffer[byte] if length := header.RepetitionLevelsByteLength(); length > 0 { if c.maxRepetitionLevel == 0 { @@ -676,9 +690,10 @@ func (c *Column) decodeDataPageV2(header DataPageHeaderV2, page *buffer, dict Di return c.decodeDataPage(header, numValues, repetitionLevels, definitionLevels, page, pageData, dict) } -func (c *Column) decodeDataPage(header DataPageHeader, numValues int, repetitionLevels, definitionLevels, page *buffer, data []byte, dict Dictionary) (Page, error) { +func (c *Column) decodeDataPage(header DataPageHeader, numValues int, repetitionLevels, definitionLevels, page *buffer[byte], data []byte, dict Dictionary) (Page, error) { pageEncoding := LookupEncoding(header.Encoding()) pageType := c.Type() + pageKind := pageType.Kind() if isDictionaryEncoding(pageEncoding) { // In some legacy configurations, the PLAIN_DICTIONARY encoding is used @@ -689,24 +704,30 @@ func (c *Column) decodeDataPage(header DataPageHeader, numValues int, repetition pageType = indexedPageType{newIndexedType(pageType, dict)} } - var vbuf, obuf *buffer - var pageValues []byte + var obuf *buffer[uint32] + var vbuf *buffer[byte] var pageOffsets []uint32 - - if pageEncoding.CanDecodeInPlace() { + var pageValues []byte + // For ByteArray/FixedLenByteArray columns, use a heap buffer because the + // content might 
be retained by the application after reading the pages, + // which would cause memory corruption if the buffers were reused. + switch { + case pageKind == ByteArray || pageKind == FixedLenByteArray: + pageValues = make([]byte, pageType.EstimateDecodeSize(numValues, data, pageEncoding)) + case pageEncoding.CanDecodeInPlace(): vbuf = page pageValues = data - } else { + default: vbuf = buffers.get(pageType.EstimateDecodeSize(numValues, data, pageEncoding)) defer vbuf.unref() pageValues = vbuf.data } // Page offsets not needed when dictionary-encoded - if pageType.Kind() == ByteArray && !isDictionaryEncoding(pageEncoding) { - obuf = buffers.get(4 * (numValues + 1)) + if pageKind == ByteArray && !isDictionaryEncoding(pageEncoding) { + obuf = offsets.get(numValues + 1) defer obuf.unref() - pageOffsets = unsafecast.Slice[uint32](obuf.data) + pageOffsets = obuf.data } values := pageType.NewValues(pageValues, pageOffsets) @@ -733,10 +754,10 @@ func (c *Column) decodeDataPage(header DataPageHeader, numValues int, repetition ) } - return newBufferedPage(newPage, vbuf, obuf, definitionLevels, repetitionLevels), nil + return newBufferedPage(newPage, obuf, vbuf, definitionLevels, repetitionLevels), nil } -func decodeLevelsV1(enc encoding.Encoding, numValues int, data []byte) (*buffer, []byte, error) { +func decodeLevelsV1(enc encoding.Encoding, numValues int, data []byte) (*buffer[byte], []byte, error) { if len(data) < 4 { return nil, data, io.ErrUnexpectedEOF } @@ -749,12 +770,12 @@ func decodeLevelsV1(enc encoding.Encoding, numValues int, data []byte) (*buffer, return levels, data[j:], err } -func decodeLevelsV2(enc encoding.Encoding, numValues int, data []byte, length int64) (*buffer, []byte, error) { +func decodeLevelsV2(enc encoding.Encoding, numValues int, data []byte, length int64) (*buffer[byte], []byte, error) { levels, err := decodeLevels(enc, numValues, data[:length]) return levels, data[length:], err } -func decodeLevels(enc encoding.Encoding, numValues int, data 
[]byte) (levels *buffer, err error) { +func decodeLevels(enc encoding.Encoding, numValues int, data []byte) (levels *buffer[byte], err error) { levels = buffers.get(numValues) levels.data, err = enc.DecodeLevels(levels.data, data) if err != nil { @@ -781,10 +802,10 @@ func skipLevelsV2(data []byte, length int64) ([]byte, error) { // DecodeDictionary decodes a data page from the header and compressed data // passed as arguments. func (c *Column) DecodeDictionary(header DictionaryPageHeader, page []byte) (Dictionary, error) { - return c.decodeDictionary(header, &buffer{data: page}, -1) + return c.decodeDictionary(header, newBuffer(page), -1) } -func (c *Column) decodeDictionary(header DictionaryPageHeader, page *buffer, size int32) (Dictionary, error) { +func (c *Column) decodeDictionary(header DictionaryPageHeader, page *buffer[byte], size int32) (Dictionary, error) { pageData := page.data if isCompressed(c.compression) { @@ -816,3 +837,58 @@ func (c *Column) decodeDictionary(header DictionaryPageHeader, page *buffer, siz var ( _ Node = (*Column)(nil) ) + +func validateColumns(t reflect.Type) (string, bool) { + // Only validate struct types + if t.Kind() != reflect.Struct { + return "", true + } + + var ( + field reflect.StructField + fieldType reflect.Type + fieldTag string + ) + + columns := make(map[string]reflect.Type, t.NumField()) + + for i := range t.NumField() { + field = t.Field(i) + + // Skip unexported fields + if !field.IsExported() { + continue + } + + fieldType = field.Type + fieldTag = field.Tag.Get("parquet") + + // Determine the actual column name using the same logic as schema generation + columnName := field.Name + if fieldTag != "" { + // Split tag by comma to get the name part + if commaIdx := strings.IndexByte(fieldTag, ','); commaIdx >= 0 { + fieldTag = fieldTag[:commaIdx] + } + // Check if field is skipped + if fieldTag == "-" { + continue + } + // Use tag name if non-empty + if fieldTag != "" { + columnName = fieldTag + } + } + + if val, ok 
:= columns[columnName]; ok { + if val == fieldType { + continue + } + return columnName, false + } else { + columns[columnName] = fieldType + } + } + + return "", true +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer.go b/vendor/github.com/parquet-go/parquet-go/column_buffer.go index c2161a3589e..31f31de467e 100644 --- a/vendor/github.com/parquet-go/parquet-go/column_buffer.go +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer.go @@ -1,22 +1,6 @@ package parquet import ( - "bytes" - "cmp" - "encoding/json" - "fmt" - "io" - "math/bits" - "reflect" - "slices" - "sort" - "time" - "unsafe" - - "github.com/parquet-go/parquet-go/deprecated" - "github.com/parquet-go/parquet-go/encoding/plain" - "github.com/parquet-go/parquet-go/internal/bitpack" - "github.com/parquet-go/parquet-go/internal/unsafecast" "github.com/parquet-go/parquet-go/sparse" ) @@ -142,2383 +126,18 @@ type reversedColumnBuffer struct{ ColumnBuffer } func (col *reversedColumnBuffer) Less(i, j int) bool { return col.ColumnBuffer.Less(j, i) } -// optionalColumnBuffer is an implementation of the ColumnBuffer interface used -// as a wrapper to an underlying ColumnBuffer to manage the creation of -// definition levels. -// -// Null values are not written to the underlying column; instead, the buffer -// tracks offsets of row values in the column, null row values are represented -// by the value -1 and a definition level less than the max. -// -// This column buffer type is used for all leaf columns that have a non-zero -// max definition level and a zero repetition level, which may be because the -// column or one of its parent(s) are marked optional. 
-type optionalColumnBuffer struct { - base ColumnBuffer - reordered bool - maxDefinitionLevel byte - rows []int32 - sortIndex []int32 - definitionLevels []byte - nullOrdering nullOrdering -} - -func newOptionalColumnBuffer(base ColumnBuffer, maxDefinitionLevel byte, nullOrdering nullOrdering) *optionalColumnBuffer { - n := base.Cap() - return &optionalColumnBuffer{ - base: base, - maxDefinitionLevel: maxDefinitionLevel, - rows: make([]int32, 0, n), - definitionLevels: make([]byte, 0, n), - nullOrdering: nullOrdering, - } -} - -func (col *optionalColumnBuffer) Clone() ColumnBuffer { - return &optionalColumnBuffer{ - base: col.base.Clone(), - reordered: col.reordered, - maxDefinitionLevel: col.maxDefinitionLevel, - rows: slices.Clone(col.rows), - definitionLevels: slices.Clone(col.definitionLevels), - nullOrdering: col.nullOrdering, - } -} - -func (col *optionalColumnBuffer) Type() Type { - return col.base.Type() -} - -func (col *optionalColumnBuffer) NumValues() int64 { - return int64(len(col.definitionLevels)) -} - -func (col *optionalColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels) -} - -func (col *optionalColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return col.base.OffsetIndex() -} - -func (col *optionalColumnBuffer) BloomFilter() BloomFilter { - return col.base.BloomFilter() -} - -func (col *optionalColumnBuffer) Dictionary() Dictionary { - return col.base.Dictionary() -} - -func (col *optionalColumnBuffer) Column() int { - return col.base.Column() -} - -func (col *optionalColumnBuffer) Pages() Pages { - return onePage(col.Page()) -} - -func (col *optionalColumnBuffer) Page() Page { - // No need for any cyclic sorting if the rows have not been reordered. - // This case is also important because the cyclic sorting modifies the - // buffer which makes it unsafe to read the buffer concurrently. 
- if col.reordered { - numNulls := countLevelsNotEqual(col.definitionLevels, col.maxDefinitionLevel) - numValues := len(col.rows) - numNulls - - if numValues > 0 { - if cap(col.sortIndex) < numValues { - col.sortIndex = make([]int32, numValues) - } - sortIndex := col.sortIndex[:numValues] - i := 0 - for _, j := range col.rows { - if j >= 0 { - sortIndex[j] = int32(i) - i++ - } - } - - // Cyclic sort: O(N) - for i := range sortIndex { - for j := int(sortIndex[i]); i != j; j = int(sortIndex[i]) { - col.base.Swap(i, j) - sortIndex[i], sortIndex[j] = sortIndex[j], sortIndex[i] - } - } - } - - i := 0 - for _, r := range col.rows { - if r >= 0 { - col.rows[i] = int32(i) - i++ - } - } - - col.reordered = false - } - - return newOptionalPage(col.base.Page(), col.maxDefinitionLevel, col.definitionLevels) -} - -func (col *optionalColumnBuffer) Reset() { - col.base.Reset() - col.rows = col.rows[:0] - col.definitionLevels = col.definitionLevels[:0] -} - -func (col *optionalColumnBuffer) Size() int64 { - return int64(4*len(col.rows)+4*len(col.sortIndex)+len(col.definitionLevels)) + col.base.Size() -} - -func (col *optionalColumnBuffer) Cap() int { return cap(col.rows) } - -func (col *optionalColumnBuffer) Len() int { return len(col.rows) } - -func (col *optionalColumnBuffer) Less(i, j int) bool { - return col.nullOrdering( - col.base, - int(col.rows[i]), - int(col.rows[j]), - col.maxDefinitionLevel, - col.definitionLevels[i], - col.definitionLevels[j], - ) -} - -func (col *optionalColumnBuffer) Swap(i, j int) { - // Because the underlying column does not contain null values, we cannot - // swap its values at indexes i and j. We swap the row indexes only, then - // reorder the underlying buffer using a cyclic sort when the buffer is - // materialized into a page view. 
- col.reordered = true - col.rows[i], col.rows[j] = col.rows[j], col.rows[i] - col.definitionLevels[i], col.definitionLevels[j] = col.definitionLevels[j], col.definitionLevels[i] -} - -func (col *optionalColumnBuffer) WriteValues(values []Value) (n int, err error) { - rowIndex := int32(col.base.Len()) - - for n < len(values) { - // Collect index range of contiguous null values, from i to n. If this - // for loop exhausts the values, all remaining if statements and for - // loops will be no-ops and the loop will terminate. - i := n - for n < len(values) && values[n].definitionLevel != col.maxDefinitionLevel { - n++ - } - - // Write the contiguous null values up until the first non-null value - // obtained in the for loop above. - for _, v := range values[i:n] { - col.rows = append(col.rows, -1) - col.definitionLevels = append(col.definitionLevels, v.definitionLevel) - } - - // Collect index range of contiguous non-null values, from i to n. - i = n - for n < len(values) && values[n].definitionLevel == col.maxDefinitionLevel { - n++ - } - - // As long as i < n we have non-null values still to write. It is - // possible that we just exhausted the input values in which case i == n - // and the outer for loop will terminate. - if i < n { - count, err := col.base.WriteValues(values[i:n]) - col.definitionLevels = appendLevel(col.definitionLevels, col.maxDefinitionLevel, count) - - for count > 0 { - col.rows = append(col.rows, rowIndex) - rowIndex++ - count-- - } - - if err != nil { - return n, err - } - } - } - return n, nil -} - -func (col *optionalColumnBuffer) writeValues(rows sparse.Array, levels columnLevels) { - // The row count is zero when writing an null optional value, in which case - // we still need to output a row to the buffer to record the definition - // level. 
- if rows.Len() == 0 { - col.definitionLevels = append(col.definitionLevels, levels.definitionLevel) - col.rows = append(col.rows, -1) - return - } - - col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, rows.Len()) - - i := len(col.rows) - j := len(col.rows) + rows.Len() - - if j <= cap(col.rows) { - col.rows = col.rows[:j] - } else { - tmp := make([]int32, j, 2*j) - copy(tmp, col.rows) - col.rows = tmp - } - - if levels.definitionLevel != col.maxDefinitionLevel { - broadcastValueInt32(col.rows[i:], -1) - } else { - broadcastRangeInt32(col.rows[i:], int32(col.base.Len())) - col.base.writeValues(rows, levels) - } -} - -func (col *optionalColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) { - length := int64(len(col.definitionLevels)) - if offset < 0 { - return 0, errRowIndexOutOfBounds(offset, length) - } - if offset >= length { - return 0, io.EOF - } - if length -= offset; length < int64(len(values)) { - values = values[:length] - } - - numNulls1 := int64(countLevelsNotEqual(col.definitionLevels[:offset], col.maxDefinitionLevel)) - numNulls2 := int64(countLevelsNotEqual(col.definitionLevels[offset:offset+length], col.maxDefinitionLevel)) - - if numNulls2 < length { - n, err := col.base.ReadValuesAt(values[:length-numNulls2], offset-numNulls1) - if err != nil { - return n, err - } - } - - if numNulls2 > 0 { - columnIndex := ^int16(col.Column()) - i := numNulls2 - 1 - j := length - 1 - definitionLevels := col.definitionLevels[offset : offset+length] - maxDefinitionLevel := col.maxDefinitionLevel - - for n := len(definitionLevels) - 1; n >= 0 && j > i; n-- { - if definitionLevels[n] != maxDefinitionLevel { - values[j] = Value{definitionLevel: definitionLevels[n], columnIndex: columnIndex} - } else { - values[j] = values[i] - i-- - } - j-- - } - } - - return int(length), nil -} - -// repeatedColumnBuffer is an implementation of the ColumnBuffer interface used -// as a wrapper to an underlying ColumnBuffer to manage the 
creation of -// repetition levels, definition levels, and map rows to the region of the -// underlying buffer that contains their sequence of values. -// -// Null values are not written to the underlying column; instead, the buffer -// tracks offsets of row values in the column, null row values are represented -// by the value -1 and a definition level less than the max. -// -// This column buffer type is used for all leaf columns that have a non-zero -// max repetition level, which may be because the column or one of its parent(s) -// are marked repeated. -type repeatedColumnBuffer struct { - base ColumnBuffer - reordered bool - maxRepetitionLevel byte - maxDefinitionLevel byte - rows []offsetMapping - repetitionLevels []byte - definitionLevels []byte - buffer []Value - reordering *repeatedColumnBuffer - nullOrdering nullOrdering -} - -// The offsetMapping type maps the logical offset of rows within the repetition -// and definition levels, to the base offsets in the underlying column buffers -// where the non-null values have been written. 
-type offsetMapping struct { - offset uint32 - baseOffset uint32 -} - -func newRepeatedColumnBuffer(base ColumnBuffer, maxRepetitionLevel, maxDefinitionLevel byte, nullOrdering nullOrdering) *repeatedColumnBuffer { - n := base.Cap() - return &repeatedColumnBuffer{ - base: base, - maxRepetitionLevel: maxRepetitionLevel, - maxDefinitionLevel: maxDefinitionLevel, - rows: make([]offsetMapping, 0, n/8), - repetitionLevels: make([]byte, 0, n), - definitionLevels: make([]byte, 0, n), - nullOrdering: nullOrdering, - } -} - -func (col *repeatedColumnBuffer) Clone() ColumnBuffer { - return &repeatedColumnBuffer{ - base: col.base.Clone(), - reordered: col.reordered, - maxRepetitionLevel: col.maxRepetitionLevel, - maxDefinitionLevel: col.maxDefinitionLevel, - rows: slices.Clone(col.rows), - repetitionLevels: slices.Clone(col.repetitionLevels), - definitionLevels: slices.Clone(col.definitionLevels), - nullOrdering: col.nullOrdering, - } -} - -func (col *repeatedColumnBuffer) Type() Type { - return col.base.Type() -} - -func (col *repeatedColumnBuffer) NumValues() int64 { - return int64(len(col.definitionLevels)) -} - -func (col *repeatedColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels) -} - -func (col *repeatedColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return col.base.OffsetIndex() -} - -func (col *repeatedColumnBuffer) BloomFilter() BloomFilter { - return col.base.BloomFilter() -} - -func (col *repeatedColumnBuffer) Dictionary() Dictionary { - return col.base.Dictionary() -} - -func (col *repeatedColumnBuffer) Column() int { - return col.base.Column() -} - -func (col *repeatedColumnBuffer) Pages() Pages { - return onePage(col.Page()) -} - -func (col *repeatedColumnBuffer) Page() Page { - if col.reordered { - if col.reordering == nil { - col.reordering = col.Clone().(*repeatedColumnBuffer) - } - - column := col.reordering - column.Reset() - maxNumValues := 0 - defer func() { - 
clearValues(col.buffer[:maxNumValues]) - }() - - baseOffset := 0 - - for _, row := range col.rows { - rowOffset := int(row.offset) - rowLength := repeatedRowLength(col.repetitionLevels[rowOffset:]) - numNulls := countLevelsNotEqual(col.definitionLevels[rowOffset:rowOffset+rowLength], col.maxDefinitionLevel) - numValues := rowLength - numNulls - - if numValues > 0 { - if numValues > cap(col.buffer) { - col.buffer = make([]Value, numValues) - } else { - col.buffer = col.buffer[:numValues] - } - n, err := col.base.ReadValuesAt(col.buffer, int64(row.baseOffset)) - if err != nil && n < numValues { - return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err) - } - if _, err := column.base.WriteValues(col.buffer); err != nil { - return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err) - } - if numValues > maxNumValues { - maxNumValues = numValues - } - } - - column.rows = append(column.rows, offsetMapping{ - offset: uint32(len(column.repetitionLevels)), - baseOffset: uint32(baseOffset), - }) - - column.repetitionLevels = append(column.repetitionLevels, col.repetitionLevels[rowOffset:rowOffset+rowLength]...) - column.definitionLevels = append(column.definitionLevels, col.definitionLevels[rowOffset:rowOffset+rowLength]...) 
- baseOffset += numValues - } - - col.swapReorderingBuffer(column) - col.reordered = false - } - - return newRepeatedPage( - col.base.Page(), - col.maxRepetitionLevel, - col.maxDefinitionLevel, - col.repetitionLevels, - col.definitionLevels, - ) -} - -func (col *repeatedColumnBuffer) swapReorderingBuffer(buf *repeatedColumnBuffer) { - col.base, buf.base = buf.base, col.base - col.rows, buf.rows = buf.rows, col.rows - col.repetitionLevels, buf.repetitionLevels = buf.repetitionLevels, col.repetitionLevels - col.definitionLevels, buf.definitionLevels = buf.definitionLevels, col.definitionLevels -} - -func (col *repeatedColumnBuffer) Reset() { - col.base.Reset() - col.rows = col.rows[:0] - col.repetitionLevels = col.repetitionLevels[:0] - col.definitionLevels = col.definitionLevels[:0] -} - -func (col *repeatedColumnBuffer) Size() int64 { - return int64(8*len(col.rows)+len(col.repetitionLevels)+len(col.definitionLevels)) + col.base.Size() -} - -func (col *repeatedColumnBuffer) Cap() int { return cap(col.rows) } - -func (col *repeatedColumnBuffer) Len() int { return len(col.rows) } - -func (col *repeatedColumnBuffer) Less(i, j int) bool { - row1 := col.rows[i] - row2 := col.rows[j] - less := col.nullOrdering - row1Length := repeatedRowLength(col.repetitionLevels[row1.offset:]) - row2Length := repeatedRowLength(col.repetitionLevels[row2.offset:]) - - for k := 0; k < row1Length && k < row2Length; k++ { - x := int(row1.baseOffset) - y := int(row2.baseOffset) - definitionLevel1 := col.definitionLevels[int(row1.offset)+k] - definitionLevel2 := col.definitionLevels[int(row2.offset)+k] - switch { - case less(col.base, x, y, col.maxDefinitionLevel, definitionLevel1, definitionLevel2): - return true - case less(col.base, y, x, col.maxDefinitionLevel, definitionLevel2, definitionLevel1): - return false - } - } - - return row1Length < row2Length -} - -func (col *repeatedColumnBuffer) Swap(i, j int) { - // Because the underlying column does not contain null values, and may hold - 
// an arbitrary number of values per row, we cannot swap its values at - // indexes i and j. We swap the row indexes only, then reorder the base - // column buffer when its view is materialized into a page by creating a - // copy and writing rows back to it following the order of rows in the - // repeated column buffer. - col.reordered = true - col.rows[i], col.rows[j] = col.rows[j], col.rows[i] -} - -func (col *repeatedColumnBuffer) WriteValues(values []Value) (numValues int, err error) { - maxRowLen := 0 - defer func() { - clearValues(col.buffer[:maxRowLen]) - }() - - for i := 0; i < len(values); { - j := i - - if values[j].repetitionLevel == 0 { - j++ - } - - for j < len(values) && values[j].repetitionLevel != 0 { - j++ - } - - if err := col.writeRow(values[i:j]); err != nil { - return numValues, err - } - - if len(col.buffer) > maxRowLen { - maxRowLen = len(col.buffer) - } - - numValues += j - i - i = j - } - - return numValues, nil -} - -func (col *repeatedColumnBuffer) writeRow(row []Value) error { - col.buffer = col.buffer[:0] - - for _, v := range row { - if v.definitionLevel == col.maxDefinitionLevel { - col.buffer = append(col.buffer, v) - } - } - - baseOffset := col.base.NumValues() - if len(col.buffer) > 0 { - if _, err := col.base.WriteValues(col.buffer); err != nil { - return err - } - } - - if row[0].repetitionLevel == 0 { - col.rows = append(col.rows, offsetMapping{ - offset: uint32(len(col.repetitionLevels)), - baseOffset: uint32(baseOffset), - }) - } - - for _, v := range row { - col.repetitionLevels = append(col.repetitionLevels, v.repetitionLevel) - col.definitionLevels = append(col.definitionLevels, v.definitionLevel) - } - - return nil -} - -func (col *repeatedColumnBuffer) writeValues(row sparse.Array, levels columnLevels) { - if levels.repetitionLevel == 0 { - col.rows = append(col.rows, offsetMapping{ - offset: uint32(len(col.repetitionLevels)), - baseOffset: uint32(col.base.NumValues()), - }) - } - - if row.Len() == 0 { - 
col.repetitionLevels = append(col.repetitionLevels, levels.repetitionLevel) - col.definitionLevels = append(col.definitionLevels, levels.definitionLevel) - return - } - - col.repetitionLevels = appendLevel(col.repetitionLevels, levels.repetitionLevel, row.Len()) - col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, row.Len()) - - if levels.definitionLevel == col.maxDefinitionLevel { - col.base.writeValues(row, levels) - } -} - -func (col *repeatedColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) { - // TODO: - panic("NOT IMPLEMENTED") -} - -// repeatedRowLength gives the length of the repeated row starting at the -// beginning of the repetitionLevels slice. -func repeatedRowLength(repetitionLevels []byte) int { - // If a repetition level exists, at least one value is required to represent - // the column. - if len(repetitionLevels) > 0 { - // The subsequent levels will represent the start of a new record when - // they go back to zero. - if i := bytes.IndexByte(repetitionLevels[1:], 0); i >= 0 { - return i + 1 - } - } - return len(repetitionLevels) -} - -// ============================================================================= -// The types below are in-memory implementations of the ColumnBuffer interface -// for each parquet type. -// -// These column buffers are created by calling NewColumnBuffer on parquet.Type -// instances; each parquet type manages to construct column buffers of the -// appropriate type, which ensures that we are packing as many values as we -// can in memory. -// -// See Type.NewColumnBuffer for details about how these types get created. -// ============================================================================= - -type booleanColumnBuffer struct{ booleanPage } - -func newBooleanColumnBuffer(typ Type, columnIndex int16, numValues int32) *booleanColumnBuffer { - // Boolean values are bit-packed, we can fit up to 8 values per byte. 
- bufferSize := (numValues + 7) / 8 - return &booleanColumnBuffer{ - booleanPage: booleanPage{ - typ: typ, - bits: make([]byte, 0, bufferSize), - columnIndex: ^columnIndex, - }, - } -} - -func (col *booleanColumnBuffer) Clone() ColumnBuffer { - return &booleanColumnBuffer{ - booleanPage: booleanPage{ - typ: col.typ, - bits: slices.Clone(col.bits), - offset: col.offset, - numValues: col.numValues, - columnIndex: col.columnIndex, - }, - } -} - -func (col *booleanColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return booleanColumnIndex{&col.booleanPage}, nil -} - -func (col *booleanColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return booleanOffsetIndex{&col.booleanPage}, nil -} - -func (col *booleanColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *booleanColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *booleanColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *booleanColumnBuffer) Page() Page { return &col.booleanPage } - -func (col *booleanColumnBuffer) Reset() { - col.bits = col.bits[:0] - col.offset = 0 - col.numValues = 0 -} - -func (col *booleanColumnBuffer) Cap() int { return 8 * cap(col.bits) } - -func (col *booleanColumnBuffer) Len() int { return int(col.numValues) } - -func (col *booleanColumnBuffer) Less(i, j int) bool { - a := col.valueAt(i) - b := col.valueAt(j) - return a != b && !a -} - -func (col *booleanColumnBuffer) valueAt(i int) bool { - j := uint32(i) / 8 - k := uint32(i) % 8 - return ((col.bits[j] >> k) & 1) != 0 -} - -func (col *booleanColumnBuffer) setValueAt(i int, v bool) { - // `offset` is always zero in the page of a column buffer - j := uint32(i) / 8 - k := uint32(i) % 8 - x := byte(0) - if v { - x = 1 - } - col.bits[j] = (col.bits[j] & ^(1 << k)) | (x << k) -} - -func (col *booleanColumnBuffer) Swap(i, j int) { - a := col.valueAt(i) - b := col.valueAt(j) - col.setValueAt(i, b) - col.setValueAt(j, a) -} - -func (col *booleanColumnBuffer) WriteBooleans(values []bool) (int, 
error) { - col.writeValues(sparse.MakeBoolArray(values).UnsafeArray(), columnLevels{}) - return len(values), nil -} - -func (col *booleanColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfBool), columnLevels{}) - return len(values), nil -} - -func (col *booleanColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - numBytes := bitpack.ByteCount(uint(col.numValues) + uint(rows.Len())) - if cap(col.bits) < numBytes { - col.bits = append(make([]byte, 0, max(numBytes, 2*cap(col.bits))), col.bits...) - } - col.bits = col.bits[:numBytes] - i := 0 - r := 8 - (int(col.numValues) % 8) - bytes := rows.Uint8Array() - - if r <= bytes.Len() { - // First we attempt to write enough bits to align the number of values - // in the column buffer on 8 bytes. After this step the next bit should - // be written at the zero'th index of a byte of the buffer. - if r < 8 { - var b byte - for i < r { - v := bytes.Index(i) - b |= (v & 1) << uint(i) - i++ - } - x := uint(col.numValues) / 8 - y := uint(col.numValues) % 8 - col.bits[x] = (b << y) | (col.bits[x] & ^(0xFF << y)) - col.numValues += int32(i) - } - - if n := ((bytes.Len() - i) / 8) * 8; n > 0 { - // At this stage, we know that that we have at least 8 bits to write - // and the bits will be aligned on the address of a byte in the - // output buffer. We can work on 8 values per loop iteration, - // packing them into a single byte and writing it to the output - // buffer. This effectively reduces by 87.5% the number of memory - // stores that the program needs to perform to generate the values. 
- i += sparse.GatherBits(col.bits[col.numValues/8:], bytes.Slice(i, i+n)) - col.numValues += int32(n) - } - } - - for i < bytes.Len() { - x := uint(col.numValues) / 8 - y := uint(col.numValues) % 8 - b := bytes.Index(i) - col.bits[x] = ((b & 1) << y) | (col.bits[x] & ^(1 << y)) - col.numValues++ - i++ - } - - col.bits = col.bits[:bitpack.ByteCount(uint(col.numValues))] -} - -func (col *booleanColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(col.numValues)) - case i >= int(col.numValues): - return 0, io.EOF - default: - for n < len(values) && i < int(col.numValues) { - values[n] = col.makeValue(col.valueAt(i)) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type int32ColumnBuffer struct{ int32Page } - -func newInt32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int32ColumnBuffer { - return &int32ColumnBuffer{ - int32Page: int32Page{ - typ: typ, - values: make([]int32, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *int32ColumnBuffer) Clone() ColumnBuffer { - return &int32ColumnBuffer{ - int32Page: int32Page{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *int32ColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return int32ColumnIndex{&col.int32Page}, nil -} - -func (col *int32ColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return int32OffsetIndex{&col.int32Page}, nil -} - -func (col *int32ColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *int32ColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *int32ColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *int32ColumnBuffer) Page() Page { return &col.int32Page } - -func (col *int32ColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *int32ColumnBuffer) Cap() int { return cap(col.values) } - -func 
(col *int32ColumnBuffer) Len() int { return len(col.values) } - -func (col *int32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } - -func (col *int32ColumnBuffer) Swap(i, j int) { - col.values[i], col.values[j] = col.values[j], col.values[i] -} - -func (col *int32ColumnBuffer) Write(b []byte) (int, error) { - if (len(b) % 4) != 0 { - return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b)) - } - col.values = append(col.values, unsafecast.Slice[int32](b)...) - return len(b), nil -} - -func (col *int32ColumnBuffer) WriteInt32s(values []int32) (int, error) { - col.values = append(col.values, values...) - return len(values), nil -} - -func (col *int32ColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfU32), columnLevels{}) - return len(values), nil -} - -func (col *int32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - if n := len(col.values) + rows.Len(); n > cap(col.values) { - col.values = append(make([]int32, 0, max(n, 2*cap(col.values))), col.values...) 
- } - n := len(col.values) - col.values = col.values[:n+rows.Len()] - sparse.GatherInt32(col.values[n:], rows.Int32Array()) - -} - -func (col *int32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type int64ColumnBuffer struct{ int64Page } - -func newInt64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int64ColumnBuffer { - return &int64ColumnBuffer{ - int64Page: int64Page{ - typ: typ, - values: make([]int64, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *int64ColumnBuffer) Clone() ColumnBuffer { - return &int64ColumnBuffer{ - int64Page: int64Page{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *int64ColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return int64ColumnIndex{&col.int64Page}, nil -} - -func (col *int64ColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return int64OffsetIndex{&col.int64Page}, nil -} - -func (col *int64ColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *int64ColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *int64ColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *int64ColumnBuffer) Page() Page { return &col.int64Page } - -func (col *int64ColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *int64ColumnBuffer) Cap() int { return cap(col.values) } - -func (col *int64ColumnBuffer) Len() int { return len(col.values) } - -func (col *int64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } - -func (col *int64ColumnBuffer) Swap(i, j int) { - col.values[i], col.values[j] = col.values[j], 
col.values[i] -} - -func (col *int64ColumnBuffer) Write(b []byte) (int, error) { - if (len(b) % 8) != 0 { - return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b)) - } - col.values = append(col.values, unsafecast.Slice[int64](b)...) - return len(b), nil -} - -func (col *int64ColumnBuffer) WriteInt64s(values []int64) (int, error) { - col.values = append(col.values, values...) - return len(values), nil -} - -func (col *int64ColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfU64), columnLevels{}) - return len(values), nil -} - -func (col *int64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - if n := len(col.values) + rows.Len(); n > cap(col.values) { - col.values = append(make([]int64, 0, max(n, 2*cap(col.values))), col.values...) - } - n := len(col.values) - col.values = col.values[:n+rows.Len()] - sparse.GatherInt64(col.values[n:], rows.Int64Array()) -} - -func (col *int64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type int96ColumnBuffer struct{ int96Page } - -func newInt96ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int96ColumnBuffer { - return &int96ColumnBuffer{ - int96Page: int96Page{ - typ: typ, - values: make([]deprecated.Int96, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *int96ColumnBuffer) Clone() ColumnBuffer { - return &int96ColumnBuffer{ - int96Page: int96Page{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *int96ColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return 
int96ColumnIndex{&col.int96Page}, nil -} - -func (col *int96ColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return int96OffsetIndex{&col.int96Page}, nil -} - -func (col *int96ColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *int96ColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *int96ColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *int96ColumnBuffer) Page() Page { return &col.int96Page } - -func (col *int96ColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *int96ColumnBuffer) Cap() int { return cap(col.values) } - -func (col *int96ColumnBuffer) Len() int { return len(col.values) } - -func (col *int96ColumnBuffer) Less(i, j int) bool { return col.values[i].Less(col.values[j]) } - -func (col *int96ColumnBuffer) Swap(i, j int) { - col.values[i], col.values[j] = col.values[j], col.values[i] -} - -func (col *int96ColumnBuffer) Write(b []byte) (int, error) { - if (len(b) % 12) != 0 { - return 0, fmt.Errorf("cannot write INT96 values from input of size %d", len(b)) - } - col.values = append(col.values, unsafecast.Slice[deprecated.Int96](b)...) - return len(b), nil -} - -func (col *int96ColumnBuffer) WriteInt96s(values []deprecated.Int96) (int, error) { - col.values = append(col.values, values...) 
- return len(values), nil -} - -func (col *int96ColumnBuffer) WriteValues(values []Value) (int, error) { - for _, v := range values { - col.values = append(col.values, v.Int96()) - } - return len(values), nil -} - -func (col *int96ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - for i := range rows.Len() { - p := rows.Index(i) - col.values = append(col.values, *(*deprecated.Int96)(p)) - } -} - -func (col *int96ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type floatColumnBuffer struct{ floatPage } - -func newFloatColumnBuffer(typ Type, columnIndex int16, numValues int32) *floatColumnBuffer { - return &floatColumnBuffer{ - floatPage: floatPage{ - typ: typ, - values: make([]float32, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *floatColumnBuffer) Clone() ColumnBuffer { - return &floatColumnBuffer{ - floatPage: floatPage{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *floatColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return floatColumnIndex{&col.floatPage}, nil -} - -func (col *floatColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return floatOffsetIndex{&col.floatPage}, nil -} - -func (col *floatColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *floatColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *floatColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *floatColumnBuffer) Page() Page { return &col.floatPage } - -func (col *floatColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *floatColumnBuffer) Cap() int { 
return cap(col.values) } - -func (col *floatColumnBuffer) Len() int { return len(col.values) } - -func (col *floatColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } - -func (col *floatColumnBuffer) Swap(i, j int) { - col.values[i], col.values[j] = col.values[j], col.values[i] -} - -func (col *floatColumnBuffer) Write(b []byte) (int, error) { - if (len(b) % 4) != 0 { - return 0, fmt.Errorf("cannot write FLOAT values from input of size %d", len(b)) - } - col.values = append(col.values, unsafecast.Slice[float32](b)...) - return len(b), nil -} - -func (col *floatColumnBuffer) WriteFloats(values []float32) (int, error) { - col.values = append(col.values, values...) - return len(values), nil -} - -func (col *floatColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfU32), columnLevels{}) - return len(values), nil -} - -func (col *floatColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - if n := len(col.values) + rows.Len(); n > cap(col.values) { - col.values = append(make([]float32, 0, max(n, 2*cap(col.values))), col.values...) 
- } - n := len(col.values) - col.values = col.values[:n+rows.Len()] - sparse.GatherFloat32(col.values[n:], rows.Float32Array()) -} - -func (col *floatColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type doubleColumnBuffer struct{ doublePage } - -func newDoubleColumnBuffer(typ Type, columnIndex int16, numValues int32) *doubleColumnBuffer { - return &doubleColumnBuffer{ - doublePage: doublePage{ - typ: typ, - values: make([]float64, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *doubleColumnBuffer) Clone() ColumnBuffer { - return &doubleColumnBuffer{ - doublePage: doublePage{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *doubleColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return doubleColumnIndex{&col.doublePage}, nil -} - -func (col *doubleColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return doubleOffsetIndex{&col.doublePage}, nil -} - -func (col *doubleColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *doubleColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *doubleColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *doubleColumnBuffer) Page() Page { return &col.doublePage } - -func (col *doubleColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *doubleColumnBuffer) Cap() int { return cap(col.values) } - -func (col *doubleColumnBuffer) Len() int { return len(col.values) } - -func (col *doubleColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } - -func (col *doubleColumnBuffer) Swap(i, j int) { - col.values[i], 
col.values[j] = col.values[j], col.values[i] -} - -func (col *doubleColumnBuffer) Write(b []byte) (int, error) { - if (len(b) % 8) != 0 { - return 0, fmt.Errorf("cannot write DOUBLE values from input of size %d", len(b)) - } - col.values = append(col.values, unsafecast.Slice[float64](b)...) - return len(b), nil -} - -func (col *doubleColumnBuffer) WriteDoubles(values []float64) (int, error) { - col.values = append(col.values, values...) - return len(values), nil -} - -func (col *doubleColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfU64), columnLevels{}) - return len(values), nil -} - -func (col *doubleColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - if n := len(col.values) + rows.Len(); n > cap(col.values) { - col.values = append(make([]float64, 0, max(n, 2*cap(col.values))), col.values...) - } - n := len(col.values) - col.values = col.values[:n+rows.Len()] - sparse.GatherFloat64(col.values[n:], rows.Float64Array()) -} - -func (col *doubleColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type byteArrayColumnBuffer struct { - byteArrayPage - lengths []uint32 - scratch []byte -} - -func newByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *byteArrayColumnBuffer { - return &byteArrayColumnBuffer{ - byteArrayPage: byteArrayPage{ - typ: typ, - values: make([]byte, 0, typ.EstimateSize(int(numValues))), - offsets: make([]uint32, 0, numValues+1), - columnIndex: ^columnIndex, - }, - lengths: make([]uint32, 0, numValues), - } -} - -func (col *byteArrayColumnBuffer) Clone() ColumnBuffer { - return &byteArrayColumnBuffer{ 
- byteArrayPage: byteArrayPage{ - typ: col.typ, - values: col.cloneValues(), - offsets: col.cloneOffsets(), - columnIndex: col.columnIndex, - }, - lengths: col.cloneLengths(), - } -} - -func (col *byteArrayColumnBuffer) cloneLengths() []uint32 { - lengths := make([]uint32, len(col.lengths)) - copy(lengths, col.lengths) - return lengths -} - -func (col *byteArrayColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return byteArrayColumnIndex{col.page()}, nil -} - -func (col *byteArrayColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return byteArrayOffsetIndex{col.page()}, nil -} - -func (col *byteArrayColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *byteArrayColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *byteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *byteArrayColumnBuffer) page() *byteArrayPage { - if len(col.lengths) > 0 && orderOfUint32(col.offsets) < 1 { // unordered? - if cap(col.scratch) < len(col.values) { - col.scratch = make([]byte, 0, cap(col.values)) - } else { - col.scratch = col.scratch[:0] - } - - for i := range col.lengths { - n := len(col.scratch) - col.scratch = append(col.scratch, col.index(i)...) 
- col.offsets[i] = uint32(n) - } - - col.values, col.scratch = col.scratch, col.values - } - col.offsets = append(col.offsets[:len(col.lengths)], uint32(len(col.values))) - return &col.byteArrayPage -} - -func (col *byteArrayColumnBuffer) Page() Page { - return col.page() -} - -func (col *byteArrayColumnBuffer) Reset() { - col.values = col.values[:0] - col.offsets = col.offsets[:0] - col.lengths = col.lengths[:0] -} - -func (col *byteArrayColumnBuffer) NumRows() int64 { return int64(col.Len()) } - -func (col *byteArrayColumnBuffer) NumValues() int64 { return int64(col.Len()) } - -func (col *byteArrayColumnBuffer) Cap() int { return cap(col.lengths) } - -func (col *byteArrayColumnBuffer) Len() int { return len(col.lengths) } - -func (col *byteArrayColumnBuffer) Less(i, j int) bool { - return bytes.Compare(col.index(i), col.index(j)) < 0 -} - -func (col *byteArrayColumnBuffer) Swap(i, j int) { - col.offsets[i], col.offsets[j] = col.offsets[j], col.offsets[i] - col.lengths[i], col.lengths[j] = col.lengths[j], col.lengths[i] -} - -func (col *byteArrayColumnBuffer) Write(b []byte) (int, error) { - _, n, err := col.writeByteArrays(b) - return n, err -} - -func (col *byteArrayColumnBuffer) WriteByteArrays(values []byte) (int, error) { - n, _, err := col.writeByteArrays(values) - return n, err -} - -func (col *byteArrayColumnBuffer) writeByteArrays(values []byte) (count, bytes int, err error) { - baseCount := len(col.lengths) - baseBytes := len(col.values) + (plain.ByteArrayLengthSize * len(col.lengths)) - - err = plain.RangeByteArray(values, func(value []byte) error { - col.append(unsafecast.String(value)) - return nil - }) - - count = len(col.lengths) - baseCount - bytes = (len(col.values) - baseBytes) + (plain.ByteArrayLengthSize * count) - return count, bytes, err -} - -func (col *byteArrayColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfPtr), columnLevels{}) - return len(values), nil -} - -func (col 
*byteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - for i := range rows.Len() { - p := rows.Index(i) - col.append(*(*string)(p)) - } -} - -func (col *byteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.lengths))) - case i >= len(col.lengths): - return 0, io.EOF - default: - for n < len(values) && i < len(col.lengths) { - values[n] = col.makeValueBytes(col.index(i)) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -func (col *byteArrayColumnBuffer) append(value string) { - col.offsets = append(col.offsets, uint32(len(col.values))) - col.lengths = append(col.lengths, uint32(len(value))) - col.values = append(col.values, value...) -} - -func (col *byteArrayColumnBuffer) index(i int) []byte { - offset := col.offsets[i] - length := col.lengths[i] - end := offset + length - return col.values[offset:end:end] -} - -type fixedLenByteArrayColumnBuffer struct { - fixedLenByteArrayPage - tmp []byte -} - -func newFixedLenByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *fixedLenByteArrayColumnBuffer { - size := typ.Length() - return &fixedLenByteArrayColumnBuffer{ - fixedLenByteArrayPage: fixedLenByteArrayPage{ - typ: typ, - size: size, - data: make([]byte, 0, typ.EstimateSize(int(numValues))), - columnIndex: ^columnIndex, - }, - tmp: make([]byte, size), - } -} - -func (col *fixedLenByteArrayColumnBuffer) Clone() ColumnBuffer { - return &fixedLenByteArrayColumnBuffer{ - fixedLenByteArrayPage: fixedLenByteArrayPage{ - typ: col.typ, - size: col.size, - data: slices.Clone(col.data), - columnIndex: col.columnIndex, - }, - tmp: make([]byte, col.size), - } -} - -func (col *fixedLenByteArrayColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return fixedLenByteArrayColumnIndex{&col.fixedLenByteArrayPage}, nil -} - -func (col *fixedLenByteArrayColumnBuffer) OffsetIndex() 
(OffsetIndex, error) { - return fixedLenByteArrayOffsetIndex{&col.fixedLenByteArrayPage}, nil -} - -func (col *fixedLenByteArrayColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *fixedLenByteArrayColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *fixedLenByteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *fixedLenByteArrayColumnBuffer) Page() Page { return &col.fixedLenByteArrayPage } - -func (col *fixedLenByteArrayColumnBuffer) Reset() { col.data = col.data[:0] } - -func (col *fixedLenByteArrayColumnBuffer) Cap() int { return cap(col.data) / col.size } - -func (col *fixedLenByteArrayColumnBuffer) Len() int { return len(col.data) / col.size } - -func (col *fixedLenByteArrayColumnBuffer) Less(i, j int) bool { - return bytes.Compare(col.index(i), col.index(j)) < 0 -} - -func (col *fixedLenByteArrayColumnBuffer) Swap(i, j int) { - t, u, v := col.tmp[:col.size], col.index(i), col.index(j) - copy(t, u) - copy(u, v) - copy(v, t) -} - -func (col *fixedLenByteArrayColumnBuffer) index(i int) []byte { - j := (i + 0) * col.size - k := (i + 1) * col.size - return col.data[j:k:k] -} - -func (col *fixedLenByteArrayColumnBuffer) Write(b []byte) (int, error) { - n, err := col.WriteFixedLenByteArrays(b) - return n * col.size, err -} - -func (col *fixedLenByteArrayColumnBuffer) WriteFixedLenByteArrays(values []byte) (int, error) { - if len(values) == 0 { - return 0, nil - } - d, m := len(values)/col.size, len(values)%col.size - if d == 0 || m != 0 { - return 0, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, len(values)) - } - col.data = append(col.data, values...) 
- return d, nil -} - -func (col *fixedLenByteArrayColumnBuffer) WriteValues(values []Value) (int, error) { - for i, v := range values { - if n := len(v.byteArray()); n != col.size { - return i, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, n) - } - col.data = append(col.data, v.byteArray()...) - } - return len(values), nil -} - -func (col *fixedLenByteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - n := col.size * rows.Len() - i := len(col.data) - j := len(col.data) + n - - if cap(col.data) < j { - col.data = append(make([]byte, 0, max(i+n, 2*cap(col.data))), col.data...) - } - - col.data = col.data[:j] - newData := col.data[i:] - - for i := range rows.Len() { - p := rows.Index(i) - copy(newData[i*col.size:], unsafe.Slice((*byte)(p), col.size)) - } -} - -func (col *fixedLenByteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) * col.size - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.data)/col.size)) - case i >= len(col.data): - return 0, io.EOF - default: - for n < len(values) && i < len(col.data) { - values[n] = col.makeValueBytes(col.data[i : i+col.size]) - n++ - i += col.size - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type uint32ColumnBuffer struct{ uint32Page } - -func newUint32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint32ColumnBuffer { - return &uint32ColumnBuffer{ - uint32Page: uint32Page{ - typ: typ, - values: make([]uint32, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *uint32ColumnBuffer) Clone() ColumnBuffer { - return &uint32ColumnBuffer{ - uint32Page: uint32Page{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *uint32ColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return uint32ColumnIndex{&col.uint32Page}, nil -} - -func (col *uint32ColumnBuffer) 
OffsetIndex() (OffsetIndex, error) { - return uint32OffsetIndex{&col.uint32Page}, nil -} - -func (col *uint32ColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *uint32ColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *uint32ColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *uint32ColumnBuffer) Page() Page { return &col.uint32Page } - -func (col *uint32ColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *uint32ColumnBuffer) Cap() int { return cap(col.values) } - -func (col *uint32ColumnBuffer) Len() int { return len(col.values) } - -func (col *uint32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } - -func (col *uint32ColumnBuffer) Swap(i, j int) { - col.values[i], col.values[j] = col.values[j], col.values[i] -} - -func (col *uint32ColumnBuffer) Write(b []byte) (int, error) { - if (len(b) % 4) != 0 { - return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b)) - } - col.values = append(col.values, unsafecast.Slice[uint32](b)...) - return len(b), nil -} - -func (col *uint32ColumnBuffer) WriteUint32s(values []uint32) (int, error) { - col.values = append(col.values, values...) - return len(values), nil -} - -func (col *uint32ColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfU32), columnLevels{}) - return len(values), nil -} - -func (col *uint32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - if n := len(col.values) + rows.Len(); n > cap(col.values) { - col.values = append(make([]uint32, 0, max(n, 2*cap(col.values))), col.values...) 
- } - n := len(col.values) - col.values = col.values[:n+rows.Len()] - sparse.GatherUint32(col.values[n:], rows.Uint32Array()) -} - -func (col *uint32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type uint64ColumnBuffer struct{ uint64Page } - -func newUint64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint64ColumnBuffer { - return &uint64ColumnBuffer{ - uint64Page: uint64Page{ - typ: typ, - values: make([]uint64, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *uint64ColumnBuffer) Clone() ColumnBuffer { - return &uint64ColumnBuffer{ - uint64Page: uint64Page{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *uint64ColumnBuffer) ColumnIndex() (ColumnIndex, error) { - return uint64ColumnIndex{&col.uint64Page}, nil -} - -func (col *uint64ColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return uint64OffsetIndex{&col.uint64Page}, nil -} - -func (col *uint64ColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *uint64ColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *uint64ColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *uint64ColumnBuffer) Page() Page { return &col.uint64Page } - -func (col *uint64ColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *uint64ColumnBuffer) Cap() int { return cap(col.values) } - -func (col *uint64ColumnBuffer) Len() int { return len(col.values) } - -func (col *uint64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } - -func (col *uint64ColumnBuffer) Swap(i, j int) { - col.values[i], 
col.values[j] = col.values[j], col.values[i] -} - -func (col *uint64ColumnBuffer) Write(b []byte) (int, error) { - if (len(b) % 8) != 0 { - return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b)) - } - col.values = append(col.values, unsafecast.Slice[uint64](b)...) - return len(b), nil -} - -func (col *uint64ColumnBuffer) WriteUint64s(values []uint64) (int, error) { - col.values = append(col.values, values...) - return len(values), nil -} - -func (col *uint64ColumnBuffer) WriteValues(values []Value) (int, error) { - col.writeValues(makeArrayValue(values, offsetOfU64), columnLevels{}) - return len(values), nil -} - -func (col *uint64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - if n := len(col.values) + rows.Len(); n > cap(col.values) { - col.values = append(make([]uint64, 0, max(n, 2*cap(col.values))), col.values...) - } - n := len(col.values) - col.values = col.values[:n+rows.Len()] - sparse.GatherUint64(col.values[n:], rows.Uint64Array()) -} - -func (col *uint64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - -type be128ColumnBuffer struct{ be128Page } - -func newBE128ColumnBuffer(typ Type, columnIndex int16, numValues int32) *be128ColumnBuffer { - return &be128ColumnBuffer{ - be128Page: be128Page{ - typ: typ, - values: make([][16]byte, 0, numValues), - columnIndex: ^columnIndex, - }, - } -} - -func (col *be128ColumnBuffer) Clone() ColumnBuffer { - return &be128ColumnBuffer{ - be128Page: be128Page{ - typ: col.typ, - values: slices.Clone(col.values), - columnIndex: col.columnIndex, - }, - } -} - -func (col *be128ColumnBuffer) ColumnIndex() (ColumnIndex, 
error) { - return be128ColumnIndex{&col.be128Page}, nil -} - -func (col *be128ColumnBuffer) OffsetIndex() (OffsetIndex, error) { - return be128OffsetIndex{&col.be128Page}, nil -} - -func (col *be128ColumnBuffer) BloomFilter() BloomFilter { return nil } - -func (col *be128ColumnBuffer) Dictionary() Dictionary { return nil } - -func (col *be128ColumnBuffer) Pages() Pages { return onePage(col.Page()) } - -func (col *be128ColumnBuffer) Page() Page { return &col.be128Page } - -func (col *be128ColumnBuffer) Reset() { col.values = col.values[:0] } - -func (col *be128ColumnBuffer) Cap() int { return cap(col.values) } - -func (col *be128ColumnBuffer) Len() int { return len(col.values) } - -func (col *be128ColumnBuffer) Less(i, j int) bool { - return lessBE128(&col.values[i], &col.values[j]) -} - -func (col *be128ColumnBuffer) Swap(i, j int) { - col.values[i], col.values[j] = col.values[j], col.values[i] -} - -func (col *be128ColumnBuffer) WriteValues(values []Value) (int, error) { - if n := len(col.values) + len(values); n > cap(col.values) { - col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...) - } - n := len(col.values) - col.values = col.values[:n+len(values)] - newValues := col.values[n:] - for i, v := range values { - copy(newValues[i][:], v.byteArray()) - } - return len(values), nil -} - -func (col *be128ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { - if n := len(col.values) + rows.Len(); n > cap(col.values) { - col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...) 
- } - n := len(col.values) - col.values = col.values[:n+rows.Len()] - sparse.GatherUint128(col.values[n:], rows.Uint128Array()) -} - -func (col *be128ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { - i := int(offset) - switch { - case i < 0: - return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) - case i >= len(col.values): - return 0, io.EOF - default: - for n < len(values) && i < len(col.values) { - values[n] = col.makeValue(&col.values[i]) - n++ - i++ - } - if n < len(values) { - err = io.EOF - } - return n, err - } -} - var ( - _ sort.Interface = (ColumnBuffer)(nil) - _ io.Writer = (*byteArrayColumnBuffer)(nil) - _ io.Writer = (*fixedLenByteArrayColumnBuffer)(nil) + _ ColumnBuffer = (*optionalColumnBuffer)(nil) + _ ColumnBuffer = (*repeatedColumnBuffer)(nil) + _ ColumnBuffer = (*booleanColumnBuffer)(nil) + _ ColumnBuffer = (*int32ColumnBuffer)(nil) + _ ColumnBuffer = (*int64ColumnBuffer)(nil) + _ ColumnBuffer = (*int96ColumnBuffer)(nil) + _ ColumnBuffer = (*floatColumnBuffer)(nil) + _ ColumnBuffer = (*doubleColumnBuffer)(nil) + _ ColumnBuffer = (*byteArrayColumnBuffer)(nil) + _ ColumnBuffer = (*fixedLenByteArrayColumnBuffer)(nil) + _ ColumnBuffer = (*uint32ColumnBuffer)(nil) + _ ColumnBuffer = (*uint64ColumnBuffer)(nil) + _ ColumnBuffer = (*be128ColumnBuffer)(nil) ) - -// writeRowsFunc is the type of functions that apply rows to a set of column -// buffers. -// -// - columns is the array of column buffer where the rows are written. -// -// - rows is the array of Go values to write to the column buffers. -// -// - levels is used to track the column index, repetition and definition levels -// of values when writing optional or repeated columns. -type writeRowsFunc func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error - -// writeRowsFuncOf generates a writeRowsFunc function for the given Go type and -// parquet schema. 
The column path indicates the column that the function is -// being generated for in the parquet schema. -func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - if leaf, exists := schema.Lookup(path...); exists && leaf.Node.Type().LogicalType() != nil && leaf.Node.Type().LogicalType().Json != nil { - return writeRowsFuncOfJSON(t, schema, path) - } - - switch t { - case reflect.TypeOf(deprecated.Int96{}): - return writeRowsFuncOfRequired(t, schema, path) - case reflect.TypeOf(time.Time{}): - return writeRowsFuncOfTime(t, schema, path) - } - - switch t.Kind() { - case reflect.Bool, - reflect.Int, - reflect.Uint, - reflect.Int32, - reflect.Uint32, - reflect.Int64, - reflect.Uint64, - reflect.Float32, - reflect.Float64, - reflect.String: - return writeRowsFuncOfRequired(t, schema, path) - - case reflect.Slice: - if t.Elem().Kind() == reflect.Uint8 { - return writeRowsFuncOfRequired(t, schema, path) - } else { - return writeRowsFuncOfSlice(t, schema, path) - } - - case reflect.Array: - if t.Elem().Kind() == reflect.Uint8 { - return writeRowsFuncOfArray(t, schema, path) - } - - case reflect.Pointer: - return writeRowsFuncOfPointer(t, schema, path) - - case reflect.Struct: - return writeRowsFuncOfStruct(t, schema, path) - - case reflect.Map: - return writeRowsFuncOfMap(t, schema, path) - } - - panic("cannot convert Go values of type " + typeNameOf(t) + " to parquet value") -} - -func writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - column := schema.lazyLoadState().mapping.lookup(path) - columnIndex := column.columnIndex - if columnIndex < 0 { - panic("parquet: column not found: " + path.String()) - } - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - columns[columnIndex].writeValues(rows, levels) - return nil - } -} - -func writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, writeRows writeRowsFunc) writeRowsFunc { - if t.Kind() == 
reflect.Slice { // assume nested list - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writeRows(columns, rows, levels) - } - levels.definitionLevel++ - return writeRows(columns, rows, levels) - } - } - nullIndex := nullIndexFuncOf(t) - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writeRows(columns, rows, levels) - } - - nulls := acquireBitmap(rows.Len()) - defer releaseBitmap(nulls) - nullIndex(nulls.bits, rows) - - nullLevels := levels - levels.definitionLevel++ - // In this function, we are dealing with optional values which are - // neither pointers nor slices; for example, a int32 field marked - // "optional" in its parent struct. - // - // We need to find zero values, which should be represented as nulls - // in the parquet column. In order to minimize the calls to writeRows - // and maximize throughput, we use the nullIndex and nonNullIndex - // functions, which are type-specific implementations of the algorithm. - // - // Sections of the input that are contiguous nulls or non-nulls can be - // sent to a single call to writeRows to be written to the underlying - // buffer since they share the same definition level. - // - // This optimization is defeated by inputs alternating null and non-null - // sequences of single values, we do not expect this condition to be a - // common case. 
- for i := 0; i < rows.Len(); { - j := 0 - x := i / 64 - y := i % 64 - - if y != 0 { - if b := nulls.bits[x] >> uint(y); b == 0 { - x++ - y = 0 - } else { - y += bits.TrailingZeros64(b) - goto writeNulls - } - } - - for x < len(nulls.bits) && nulls.bits[x] == 0 { - x++ - } - - if x < len(nulls.bits) { - y = bits.TrailingZeros64(nulls.bits[x]) % 64 - } - - writeNulls: - if j = x*64 + y; j > rows.Len() { - j = rows.Len() - } - - if i < j { - if err := writeRows(columns, rows.Slice(i, j), nullLevels); err != nil { - return err - } - i = j - } - - if y != 0 { - if b := nulls.bits[x] >> uint(y); b == (1< rows.Len() { - j = rows.Len() - } - - if i < j { - if err := writeRows(columns, rows.Slice(i, j), levels); err != nil { - return err - } - i = j - } - } - - return nil - } -} - -func writeRowsFuncOfArray(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - column := schema.lazyLoadState().mapping.lookup(path) - arrayLen := t.Len() - columnLen := column.node.Type().Length() - if arrayLen != columnLen { - panic(fmt.Sprintf("cannot convert Go values of type "+typeNameOf(t)+" to FIXED_LEN_BYTE_ARRAY(%d)", columnLen)) - } - return writeRowsFuncOfRequired(t, schema, path) -} - -func writeRowsFuncOfPointer(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - elemType := t.Elem() - elemSize := uintptr(elemType.Size()) - writeRows := writeRowsFuncOf(elemType, schema, path) - - if len(path) == 0 { - // This code path is taken when generating a writeRowsFunc for a pointer - // type. In this case, we do not need to increase the definition level - // since we are not deailng with an optional field but a pointer to the - // row type. 
- return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writeRows(columns, rows, levels) - } - - for i := range rows.Len() { - p := *(*unsafe.Pointer)(rows.Index(i)) - a := sparse.Array{} - if p != nil { - a = makeArray(p, 1, elemSize) - } - if err := writeRows(columns, a, levels); err != nil { - return err - } - } - - return nil - } - } - - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writeRows(columns, rows, levels) - } - - for i := range rows.Len() { - p := *(*unsafe.Pointer)(rows.Index(i)) - a := sparse.Array{} - elemLevels := levels - if p != nil { - a = makeArray(p, 1, elemSize) - elemLevels.definitionLevel++ - } - if err := writeRows(columns, a, elemLevels); err != nil { - return err - } - } - - return nil - } -} - -func writeRowsFuncOfSlice(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - elemType := t.Elem() - elemSize := uintptr(elemType.Size()) - writeRows := writeRowsFuncOf(elemType, schema, path) - - // When the element is a pointer type, the writeRows function will be an - // instance returned by writeRowsFuncOfPointer, which handles incrementing - // the definition level if the pointer value is not nil. 
- definitionLevelIncrement := byte(0) - if elemType.Kind() != reflect.Ptr { - definitionLevelIncrement = 1 - } - - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writeRows(columns, rows, levels) - } - - levels.repetitionDepth++ - - for i := range rows.Len() { - p := (*sliceHeader)(rows.Index(i)) - a := makeArray(p.base, p.len, elemSize) - b := sparse.Array{} - - elemLevels := levels - if a.Len() > 0 { - b = a.Slice(0, 1) - elemLevels.definitionLevel += definitionLevelIncrement - } - - if err := writeRows(columns, b, elemLevels); err != nil { - return err - } - - if a.Len() > 1 { - elemLevels.repetitionLevel = elemLevels.repetitionDepth - - if err := writeRows(columns, a.Slice(1, a.Len()), elemLevels); err != nil { - return err - } - } - } - - return nil - } -} - -func writeRowsFuncOfStruct(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - type column struct { - offset uintptr - writeRows writeRowsFunc - } - - fields := structFieldsOf(t) - columns := make([]column, len(fields)) - - for i, f := range fields { - list, optional := false, false - columnPath := path.append(f.Name) - forEachStructTagOption(f, func(_ reflect.Type, option, _ string) { - switch option { - case "list": - list = true - columnPath = columnPath.append("list", "element") - case "optional": - optional = true - } - }) - - writeRows := writeRowsFuncOf(f.Type, schema, columnPath) - if optional { - kind := f.Type.Kind() - switch { - case kind == reflect.Pointer: - case kind == reflect.Slice && !list: - default: - writeRows = writeRowsFuncOfOptional(f.Type, schema, columnPath, writeRows) - } - } - - columns[i] = column{ - offset: f.Offset, - writeRows: writeRows, - } - } - - return func(buffers []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - for _, column := range columns { - if err := column.writeRows(buffers, rows, levels); err != nil { - return err - } - } - } else { - for 
_, column := range columns { - if err := column.writeRows(buffers, rows.Offset(column.offset), levels); err != nil { - return err - } - } - } - return nil - } -} - -func writeRowsFuncOfMap(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - keyPath := path.append("key_value", "key") - keyType := t.Key() - keySize := uintptr(keyType.Size()) - writeKeys := writeRowsFuncOf(keyType, schema, keyPath) - - valuePath := path.append("key_value", "value") - valueType := t.Elem() - valueSize := uintptr(valueType.Size()) - writeValues := writeRowsFuncOf(valueType, schema, valuePath) - - writeKeyValues := func(columns []ColumnBuffer, keys, values sparse.Array, levels columnLevels) error { - if err := writeKeys(columns, keys, levels); err != nil { - return err - } - if err := writeValues(columns, values, levels); err != nil { - return err - } - return nil - } - - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writeKeyValues(columns, rows, rows, levels) - } - - levels.repetitionDepth++ - mapKey := reflect.Value{} - mapValue := reflect.Value{} - compareKeys := compareFuncOf(keyType) - if compareKeys == nil { - mapKey = reflect.New(keyType).Elem() - mapValue = reflect.New(valueType).Elem() - } - - for i := range rows.Len() { - m := reflect.NewAt(t, rows.Index(i)).Elem() - - if m.Len() == 0 { - empty := sparse.Array{} - if err := writeKeyValues(columns, empty, empty, levels); err != nil { - return err - } - continue - } - - elemLevels := levels - elemLevels.definitionLevel++ - - if compareKeys != nil { - keys := m.MapKeys() - slices.SortFunc(keys, compareKeys) - - for _, key := range keys { - value := m.MapIndex(key) - - k := makeArray(reflectValueData(key), 1, keySize) - v := makeArray(reflectValueData(value), 1, valueSize) - - if err := writeKeyValues(columns, k, v, elemLevels); err != nil { - return err - } - - elemLevels.repetitionLevel = elemLevels.repetitionDepth - } - } else { - for it := 
m.MapRange(); it.Next(); { - mapKey.SetIterKey(it) - mapValue.SetIterValue(it) - - k := makeArray(reflectValueData(mapKey), 1, keySize) - v := makeArray(reflectValueData(mapValue), 1, valueSize) - - if err := writeKeyValues(columns, k, v, elemLevels); err != nil { - return err - } - - elemLevels.repetitionLevel = elemLevels.repetitionDepth - } - } - } - - return nil - } -} - -func compareFuncOf(t reflect.Type) func(reflect.Value, reflect.Value) int { - switch t.Kind() { - case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: - return func(a, b reflect.Value) int { - return cmp.Compare(a.Int(), b.Int()) - } - case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: - return func(a, b reflect.Value) int { - return cmp.Compare(a.Uint(), b.Uint()) - } - case reflect.Float32, reflect.Float64: - return func(a, b reflect.Value) int { - return cmp.Compare(a.Float(), b.Float()) - } - case reflect.String: - return func(a, b reflect.Value) int { - return cmp.Compare(a.String(), b.String()) - } - default: - return nil - } -} - -func writeRowsFuncOfJSON(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - // If this is a string or a byte array write directly. 
- switch t.Kind() { - case reflect.String: - return writeRowsFuncOfRequired(t, schema, path) - case reflect.Slice: - if t.Elem().Kind() == reflect.Uint8 { - return writeRowsFuncOfRequired(t, schema, path) - } - } - - // Otherwise handle with a json.Marshal - asStrT := reflect.TypeOf(string("")) - writer := writeRowsFuncOfRequired(asStrT, schema, path) - - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writer(columns, rows, levels) - } - for i := range rows.Len() { - val := reflect.NewAt(t, rows.Index(i)) - asI := val.Interface() - - b, err := json.Marshal(asI) - if err != nil { - return err - } - - asStr := string(b) - a := sparse.MakeStringArray([]string{asStr}) - if err := writer(columns, a.UnsafeArray(), levels); err != nil { - return err - } - } - return nil - } -} - -func writeRowsFuncOfTime(_ reflect.Type, schema *Schema, path columnPath) writeRowsFunc { - t := reflect.TypeOf(int64(0)) - elemSize := uintptr(t.Size()) - writeRows := writeRowsFuncOf(t, schema, path) - - col, _ := schema.Lookup(path...) 
- unit := Nanosecond.TimeUnit() - lt := col.Node.Type().LogicalType() - if lt != nil && lt.Timestamp != nil { - unit = lt.Timestamp.Unit - } - - return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { - if rows.Len() == 0 { - return writeRows(columns, rows, levels) - } - - times := rows.TimeArray() - for i := range times.Len() { - t := times.Index(i) - var val int64 - switch { - case unit.Millis != nil: - val = t.UnixMilli() - case unit.Micros != nil: - val = t.UnixMicro() - default: - val = t.UnixNano() - } - - a := makeArray(reflectValueData(reflect.ValueOf(val)), 1, elemSize) - if err := writeRows(columns, a, levels); err != nil { - return err - } - } - - return nil - } -} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_amd64.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_amd64.go index 45717269965..8f7e91f654e 100644 --- a/vendor/github.com/parquet-go/parquet-go/column_buffer_amd64.go +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_amd64.go @@ -3,8 +3,8 @@ package parquet import ( + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/internal/bytealg" - "github.com/parquet-go/parquet-go/internal/unsafecast" "github.com/parquet-go/parquet-go/sparse" "golang.org/x/sys/cpu" ) diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_be128.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_be128.go new file mode 100644 index 00000000000..cf4169639ec --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_be128.go @@ -0,0 +1,102 @@ +package parquet + +import ( + "io" + "slices" + + "github.com/parquet-go/parquet-go/sparse" +) + +type be128ColumnBuffer struct{ be128Page } + +func newBE128ColumnBuffer(typ Type, columnIndex int16, numValues int32) *be128ColumnBuffer { + return &be128ColumnBuffer{ + be128Page: be128Page{ + typ: typ, + values: make([][16]byte, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col 
*be128ColumnBuffer) Clone() ColumnBuffer { + return &be128ColumnBuffer{ + be128Page: be128Page{ + typ: col.typ, + values: slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *be128ColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return be128ColumnIndex{&col.be128Page}, nil +} + +func (col *be128ColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return be128OffsetIndex{&col.be128Page}, nil +} + +func (col *be128ColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *be128ColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *be128ColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *be128ColumnBuffer) Page() Page { return &col.be128Page } + +func (col *be128ColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *be128ColumnBuffer) Cap() int { return cap(col.values) } + +func (col *be128ColumnBuffer) Len() int { return len(col.values) } + +func (col *be128ColumnBuffer) Less(i, j int) bool { + return lessBE128(&col.values[i], &col.values[j]) +} + +func (col *be128ColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *be128ColumnBuffer) WriteValues(values []Value) (int, error) { + if n := len(col.values) + len(values); n > cap(col.values) { + col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...) + } + n := len(col.values) + col.values = col.values[:n+len(values)] + newValues := col.values[n:] + for i, v := range values { + copy(newValues[i][:], v.byteArray()) + } + return len(values), nil +} + +func (col *be128ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + if n := len(col.values) + rows.Len(); n > cap(col.values) { + col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...) 
+ } + n := len(col.values) + col.values = col.values[:n+rows.Len()] + sparse.GatherUint128(col.values[n:], rows.Uint128Array()) +} + +func (col *be128ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(&col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_boolean.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_boolean.go new file mode 100644 index 00000000000..9cfc24d73b9 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_boolean.go @@ -0,0 +1,172 @@ +package parquet + +import ( + "io" + "slices" + + "github.com/parquet-go/bitpack" + "github.com/parquet-go/parquet-go/sparse" +) + +type booleanColumnBuffer struct{ booleanPage } + +func newBooleanColumnBuffer(typ Type, columnIndex int16, numValues int32) *booleanColumnBuffer { + // Boolean values are bit-packed, we can fit up to 8 values per byte. 
+ bufferSize := (numValues + 7) / 8 + return &booleanColumnBuffer{ + booleanPage: booleanPage{ + typ: typ, + bits: make([]byte, 0, bufferSize), + columnIndex: ^columnIndex, + }, + } +} + +func (col *booleanColumnBuffer) Clone() ColumnBuffer { + return &booleanColumnBuffer{ + booleanPage: booleanPage{ + typ: col.typ, + bits: slices.Clone(col.bits), + offset: col.offset, + numValues: col.numValues, + columnIndex: col.columnIndex, + }, + } +} + +func (col *booleanColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return booleanColumnIndex{&col.booleanPage}, nil +} + +func (col *booleanColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return booleanOffsetIndex{&col.booleanPage}, nil +} + +func (col *booleanColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *booleanColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *booleanColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *booleanColumnBuffer) Page() Page { return &col.booleanPage } + +func (col *booleanColumnBuffer) Reset() { + col.bits = col.bits[:0] + col.offset = 0 + col.numValues = 0 +} + +func (col *booleanColumnBuffer) Cap() int { return 8 * cap(col.bits) } + +func (col *booleanColumnBuffer) Len() int { return int(col.numValues) } + +func (col *booleanColumnBuffer) Less(i, j int) bool { + a := col.valueAt(i) + b := col.valueAt(j) + return a != b && !a +} + +func (col *booleanColumnBuffer) valueAt(i int) bool { + j := uint32(i) / 8 + k := uint32(i) % 8 + return ((col.bits[j] >> k) & 1) != 0 +} + +func (col *booleanColumnBuffer) setValueAt(i int, v bool) { + // `offset` is always zero in the page of a column buffer + j := uint32(i) / 8 + k := uint32(i) % 8 + x := byte(0) + if v { + x = 1 + } + col.bits[j] = (col.bits[j] & ^(1 << k)) | (x << k) +} + +func (col *booleanColumnBuffer) Swap(i, j int) { + a := col.valueAt(i) + b := col.valueAt(j) + col.setValueAt(i, b) + col.setValueAt(j, a) +} + +func (col *booleanColumnBuffer) WriteBooleans(values []bool) (int, 
error) { + col.writeValues(sparse.MakeBoolArray(values).UnsafeArray(), columnLevels{}) + return len(values), nil +} + +func (col *booleanColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfBool), columnLevels{}) + return len(values), nil +} + +func (col *booleanColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + numBytes := bitpack.ByteCount(uint(col.numValues) + uint(rows.Len())) + if cap(col.bits) < numBytes { + col.bits = append(make([]byte, 0, max(numBytes, 2*cap(col.bits))), col.bits...) + } + col.bits = col.bits[:numBytes] + i := 0 + r := 8 - (int(col.numValues) % 8) + bytes := rows.Uint8Array() + + if r <= bytes.Len() { + // First we attempt to write enough bits to align the number of values + // in the column buffer on 8 bytes. After this step the next bit should + // be written at the zero'th index of a byte of the buffer. + if r < 8 { + var b byte + for i < r { + v := bytes.Index(i) + b |= (v & 1) << uint(i) + i++ + } + x := uint(col.numValues) / 8 + y := uint(col.numValues) % 8 + col.bits[x] = (b << y) | (col.bits[x] & ^(0xFF << y)) + col.numValues += int32(i) + } + + if n := ((bytes.Len() - i) / 8) * 8; n > 0 { + // At this stage, we know that that we have at least 8 bits to write + // and the bits will be aligned on the address of a byte in the + // output buffer. We can work on 8 values per loop iteration, + // packing them into a single byte and writing it to the output + // buffer. This effectively reduces by 87.5% the number of memory + // stores that the program needs to perform to generate the values. 
+ i += sparse.GatherBits(col.bits[col.numValues/8:], bytes.Slice(i, i+n)) + col.numValues += int32(n) + } + } + + for i < bytes.Len() { + x := uint(col.numValues) / 8 + y := uint(col.numValues) % 8 + b := bytes.Index(i) + col.bits[x] = ((b & 1) << y) | (col.bits[x] & ^(1 << y)) + col.numValues++ + i++ + } + + col.bits = col.bits[:bitpack.ByteCount(uint(col.numValues))] +} + +func (col *booleanColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(col.numValues)) + case i >= int(col.numValues): + return 0, io.EOF + default: + for n < len(values) && i < int(col.numValues) { + values[n] = col.makeValue(col.valueAt(i)) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_byte_array.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_byte_array.go new file mode 100644 index 00000000000..0fc2d503c46 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_byte_array.go @@ -0,0 +1,176 @@ +package parquet + +import ( + "bytes" + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/encoding/plain" + "github.com/parquet-go/parquet-go/sparse" +) + +type byteArrayColumnBuffer struct { + byteArrayPage + lengths []uint32 + scratch []byte +} + +func newByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *byteArrayColumnBuffer { + return &byteArrayColumnBuffer{ + byteArrayPage: byteArrayPage{ + typ: typ, + values: make([]byte, 0, typ.EstimateSize(int(numValues))), + offsets: make([]uint32, 0, numValues+1), + columnIndex: ^columnIndex, + }, + lengths: make([]uint32, 0, numValues), + } +} + +func (col *byteArrayColumnBuffer) Clone() ColumnBuffer { + return &byteArrayColumnBuffer{ + byteArrayPage: byteArrayPage{ + typ: col.typ, + values: col.cloneValues(), + offsets: col.cloneOffsets(), + columnIndex: 
col.columnIndex, + }, + lengths: col.cloneLengths(), + } +} + +func (col *byteArrayColumnBuffer) cloneLengths() []uint32 { + lengths := make([]uint32, len(col.lengths)) + copy(lengths, col.lengths) + return lengths +} + +func (col *byteArrayColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return byteArrayColumnIndex{col.page()}, nil +} + +func (col *byteArrayColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return byteArrayOffsetIndex{col.page()}, nil +} + +func (col *byteArrayColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *byteArrayColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *byteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *byteArrayColumnBuffer) page() *byteArrayPage { + if len(col.lengths) > 0 && orderOfUint32(col.offsets) < 1 { // unordered? + if cap(col.scratch) < len(col.values) { + col.scratch = make([]byte, 0, cap(col.values)) + } else { + col.scratch = col.scratch[:0] + } + + for i := range col.lengths { + n := len(col.scratch) + col.scratch = append(col.scratch, col.index(i)...) 
+ col.offsets[i] = uint32(n) + } + + col.values, col.scratch = col.scratch, col.values + } + col.offsets = append(col.offsets[:len(col.lengths)], uint32(len(col.values))) + return &col.byteArrayPage +} + +func (col *byteArrayColumnBuffer) Page() Page { + return col.page() +} + +func (col *byteArrayColumnBuffer) Reset() { + col.values = col.values[:0] + col.offsets = col.offsets[:0] + col.lengths = col.lengths[:0] +} + +func (col *byteArrayColumnBuffer) NumRows() int64 { return int64(col.Len()) } + +func (col *byteArrayColumnBuffer) NumValues() int64 { return int64(col.Len()) } + +func (col *byteArrayColumnBuffer) Cap() int { return cap(col.lengths) } + +func (col *byteArrayColumnBuffer) Len() int { return len(col.lengths) } + +func (col *byteArrayColumnBuffer) Less(i, j int) bool { + return bytes.Compare(col.index(i), col.index(j)) < 0 +} + +func (col *byteArrayColumnBuffer) Swap(i, j int) { + col.offsets[i], col.offsets[j] = col.offsets[j], col.offsets[i] + col.lengths[i], col.lengths[j] = col.lengths[j], col.lengths[i] +} + +func (col *byteArrayColumnBuffer) Write(b []byte) (int, error) { + _, n, err := col.writeByteArrays(b) + return n, err +} + +func (col *byteArrayColumnBuffer) WriteByteArrays(values []byte) (int, error) { + n, _, err := col.writeByteArrays(values) + return n, err +} + +func (col *byteArrayColumnBuffer) writeByteArrays(values []byte) (count, bytes int, err error) { + baseCount := len(col.lengths) + baseBytes := len(col.values) + (plain.ByteArrayLengthSize * len(col.lengths)) + + err = plain.RangeByteArray(values, func(value []byte) error { + col.append(unsafecast.String(value)) + return nil + }) + + count = len(col.lengths) - baseCount + bytes = (len(col.values) - baseBytes) + (plain.ByteArrayLengthSize * count) + return count, bytes, err +} + +func (col *byteArrayColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfPtr), columnLevels{}) + return len(values), nil +} + +func (col 
*byteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + for i := range rows.Len() { + p := rows.Index(i) + col.append(*(*string)(p)) + } +} + +func (col *byteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.lengths))) + case i >= len(col.lengths): + return 0, io.EOF + default: + for n < len(values) && i < len(col.lengths) { + values[n] = col.makeValueBytes(col.index(i)) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} + +func (col *byteArrayColumnBuffer) append(value string) { + col.offsets = append(col.offsets, uint32(len(col.values))) + col.lengths = append(col.lengths, uint32(len(value))) + col.values = append(col.values, value...) +} + +func (col *byteArrayColumnBuffer) index(i int) []byte { + offset := col.offsets[i] + length := col.lengths[i] + end := offset + length + return col.values[offset:end:end] +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_double.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_double.go new file mode 100644 index 00000000000..ccb7e9ae3f6 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_double.go @@ -0,0 +1,107 @@ +package parquet + +import ( + "fmt" + "io" + "slices" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/sparse" +) + +type doubleColumnBuffer struct{ doublePage } + +func newDoubleColumnBuffer(typ Type, columnIndex int16, numValues int32) *doubleColumnBuffer { + return &doubleColumnBuffer{ + doublePage: doublePage{ + typ: typ, + values: make([]float64, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col *doubleColumnBuffer) Clone() ColumnBuffer { + return &doubleColumnBuffer{ + doublePage: doublePage{ + typ: col.typ, + values: slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *doubleColumnBuffer) 
ColumnIndex() (ColumnIndex, error) { + return doubleColumnIndex{&col.doublePage}, nil +} + +func (col *doubleColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return doubleOffsetIndex{&col.doublePage}, nil +} + +func (col *doubleColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *doubleColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *doubleColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *doubleColumnBuffer) Page() Page { return &col.doublePage } + +func (col *doubleColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *doubleColumnBuffer) Cap() int { return cap(col.values) } + +func (col *doubleColumnBuffer) Len() int { return len(col.values) } + +func (col *doubleColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } + +func (col *doubleColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *doubleColumnBuffer) Write(b []byte) (int, error) { + if (len(b) % 8) != 0 { + return 0, fmt.Errorf("cannot write DOUBLE values from input of size %d", len(b)) + } + col.values = append(col.values, unsafecast.Slice[float64](b)...) + return len(b), nil +} + +func (col *doubleColumnBuffer) WriteDoubles(values []float64) (int, error) { + col.values = append(col.values, values...) + return len(values), nil +} + +func (col *doubleColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfU64), columnLevels{}) + return len(values), nil +} + +func (col *doubleColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + if n := len(col.values) + rows.Len(); n > cap(col.values) { + col.values = append(make([]float64, 0, max(n, 2*cap(col.values))), col.values...) 
+ } + n := len(col.values) + col.values = col.values[:n+rows.Len()] + sparse.GatherFloat64(col.values[n:], rows.Float64Array()) +} + +func (col *doubleColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_fixed_len_byte_array.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_fixed_len_byte_array.go new file mode 100644 index 00000000000..dfb1bd0e80a --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_fixed_len_byte_array.go @@ -0,0 +1,145 @@ +package parquet + +import ( + "bytes" + "fmt" + "io" + "slices" + "unsafe" + + "github.com/parquet-go/parquet-go/sparse" +) + +type fixedLenByteArrayColumnBuffer struct { + fixedLenByteArrayPage + tmp []byte +} + +func newFixedLenByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *fixedLenByteArrayColumnBuffer { + size := typ.Length() + return &fixedLenByteArrayColumnBuffer{ + fixedLenByteArrayPage: fixedLenByteArrayPage{ + typ: typ, + size: size, + data: make([]byte, 0, typ.EstimateSize(int(numValues))), + columnIndex: ^columnIndex, + }, + tmp: make([]byte, size), + } +} + +func (col *fixedLenByteArrayColumnBuffer) Clone() ColumnBuffer { + return &fixedLenByteArrayColumnBuffer{ + fixedLenByteArrayPage: fixedLenByteArrayPage{ + typ: col.typ, + size: col.size, + data: slices.Clone(col.data), + columnIndex: col.columnIndex, + }, + tmp: make([]byte, col.size), + } +} + +func (col *fixedLenByteArrayColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return fixedLenByteArrayColumnIndex{&col.fixedLenByteArrayPage}, nil +} + +func (col 
*fixedLenByteArrayColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return fixedLenByteArrayOffsetIndex{&col.fixedLenByteArrayPage}, nil +} + +func (col *fixedLenByteArrayColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *fixedLenByteArrayColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *fixedLenByteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *fixedLenByteArrayColumnBuffer) Page() Page { return &col.fixedLenByteArrayPage } + +func (col *fixedLenByteArrayColumnBuffer) Reset() { col.data = col.data[:0] } + +func (col *fixedLenByteArrayColumnBuffer) Cap() int { return cap(col.data) / col.size } + +func (col *fixedLenByteArrayColumnBuffer) Len() int { return len(col.data) / col.size } + +func (col *fixedLenByteArrayColumnBuffer) Less(i, j int) bool { + return bytes.Compare(col.index(i), col.index(j)) < 0 +} + +func (col *fixedLenByteArrayColumnBuffer) Swap(i, j int) { + t, u, v := col.tmp[:col.size], col.index(i), col.index(j) + copy(t, u) + copy(u, v) + copy(v, t) +} + +func (col *fixedLenByteArrayColumnBuffer) index(i int) []byte { + j := (i + 0) * col.size + k := (i + 1) * col.size + return col.data[j:k:k] +} + +func (col *fixedLenByteArrayColumnBuffer) Write(b []byte) (int, error) { + n, err := col.WriteFixedLenByteArrays(b) + return n * col.size, err +} + +func (col *fixedLenByteArrayColumnBuffer) WriteFixedLenByteArrays(values []byte) (int, error) { + if len(values) == 0 { + return 0, nil + } + d, m := len(values)/col.size, len(values)%col.size + if d == 0 || m != 0 { + return 0, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, len(values)) + } + col.data = append(col.data, values...) 
+ return d, nil +} + +func (col *fixedLenByteArrayColumnBuffer) WriteValues(values []Value) (int, error) { + for i, v := range values { + if n := len(v.byteArray()); n != col.size { + return i, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, n) + } + col.data = append(col.data, v.byteArray()...) + } + return len(values), nil +} + +func (col *fixedLenByteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + n := col.size * rows.Len() + i := len(col.data) + j := len(col.data) + n + + if cap(col.data) < j { + col.data = append(make([]byte, 0, max(i+n, 2*cap(col.data))), col.data...) + } + + col.data = col.data[:j] + newData := col.data[i:] + + for i := range rows.Len() { + p := rows.Index(i) + copy(newData[i*col.size:], unsafe.Slice((*byte)(p), col.size)) + } +} + +func (col *fixedLenByteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) * col.size + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.data)/col.size)) + case i >= len(col.data): + return 0, io.EOF + default: + for n < len(values) && i < len(col.data) { + values[n] = col.makeValueBytes(col.data[i : i+col.size]) + n++ + i += col.size + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_float.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_float.go new file mode 100644 index 00000000000..6627a2f01fc --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_float.go @@ -0,0 +1,107 @@ +package parquet + +import ( + "fmt" + "io" + "slices" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/sparse" +) + +type floatColumnBuffer struct{ floatPage } + +func newFloatColumnBuffer(typ Type, columnIndex int16, numValues int32) *floatColumnBuffer { + return &floatColumnBuffer{ + floatPage: floatPage{ + typ: typ, + values: 
make([]float32, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col *floatColumnBuffer) Clone() ColumnBuffer { + return &floatColumnBuffer{ + floatPage: floatPage{ + typ: col.typ, + values: slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *floatColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return floatColumnIndex{&col.floatPage}, nil +} + +func (col *floatColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return floatOffsetIndex{&col.floatPage}, nil +} + +func (col *floatColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *floatColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *floatColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *floatColumnBuffer) Page() Page { return &col.floatPage } + +func (col *floatColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *floatColumnBuffer) Cap() int { return cap(col.values) } + +func (col *floatColumnBuffer) Len() int { return len(col.values) } + +func (col *floatColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } + +func (col *floatColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *floatColumnBuffer) Write(b []byte) (int, error) { + if (len(b) % 4) != 0 { + return 0, fmt.Errorf("cannot write FLOAT values from input of size %d", len(b)) + } + col.values = append(col.values, unsafecast.Slice[float32](b)...) + return len(b), nil +} + +func (col *floatColumnBuffer) WriteFloats(values []float32) (int, error) { + col.values = append(col.values, values...) 
+ return len(values), nil +} + +func (col *floatColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfU32), columnLevels{}) + return len(values), nil +} + +func (col *floatColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + if n := len(col.values) + rows.Len(); n > cap(col.values) { + col.values = append(make([]float32, 0, max(n, 2*cap(col.values))), col.values...) + } + n := len(col.values) + col.values = col.values[:n+rows.Len()] + sparse.GatherFloat32(col.values[n:], rows.Float32Array()) +} + +func (col *floatColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_int32.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_int32.go new file mode 100644 index 00000000000..5efa0847266 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_int32.go @@ -0,0 +1,108 @@ +package parquet + +import ( + "fmt" + "io" + "slices" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/sparse" +) + +type int32ColumnBuffer struct{ int32Page } + +func newInt32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int32ColumnBuffer { + return &int32ColumnBuffer{ + int32Page: int32Page{ + typ: typ, + values: make([]int32, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col *int32ColumnBuffer) Clone() ColumnBuffer { + return &int32ColumnBuffer{ + int32Page: int32Page{ + typ: col.typ, + values: slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *int32ColumnBuffer) ColumnIndex() (ColumnIndex, 
error) { + return int32ColumnIndex{&col.int32Page}, nil +} + +func (col *int32ColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return int32OffsetIndex{&col.int32Page}, nil +} + +func (col *int32ColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *int32ColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *int32ColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *int32ColumnBuffer) Page() Page { return &col.int32Page } + +func (col *int32ColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *int32ColumnBuffer) Cap() int { return cap(col.values) } + +func (col *int32ColumnBuffer) Len() int { return len(col.values) } + +func (col *int32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } + +func (col *int32ColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *int32ColumnBuffer) Write(b []byte) (int, error) { + if (len(b) % 4) != 0 { + return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b)) + } + col.values = append(col.values, unsafecast.Slice[int32](b)...) + return len(b), nil +} + +func (col *int32ColumnBuffer) WriteInt32s(values []int32) (int, error) { + col.values = append(col.values, values...) + return len(values), nil +} + +func (col *int32ColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfU32), columnLevels{}) + return len(values), nil +} + +func (col *int32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + if n := len(col.values) + rows.Len(); n > cap(col.values) { + col.values = append(make([]int32, 0, max(n, 2*cap(col.values))), col.values...) 
+ } + n := len(col.values) + col.values = col.values[:n+rows.Len()] + sparse.GatherInt32(col.values[n:], rows.Int32Array()) + +} + +func (col *int32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_int64.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_int64.go new file mode 100644 index 00000000000..0aac0ecb9e2 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_int64.go @@ -0,0 +1,107 @@ +package parquet + +import ( + "fmt" + "io" + "slices" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/sparse" +) + +type int64ColumnBuffer struct{ int64Page } + +func newInt64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int64ColumnBuffer { + return &int64ColumnBuffer{ + int64Page: int64Page{ + typ: typ, + values: make([]int64, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col *int64ColumnBuffer) Clone() ColumnBuffer { + return &int64ColumnBuffer{ + int64Page: int64Page{ + typ: col.typ, + values: slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *int64ColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return int64ColumnIndex{&col.int64Page}, nil +} + +func (col *int64ColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return int64OffsetIndex{&col.int64Page}, nil +} + +func (col *int64ColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *int64ColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *int64ColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *int64ColumnBuffer) 
Page() Page { return &col.int64Page } + +func (col *int64ColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *int64ColumnBuffer) Cap() int { return cap(col.values) } + +func (col *int64ColumnBuffer) Len() int { return len(col.values) } + +func (col *int64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } + +func (col *int64ColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *int64ColumnBuffer) Write(b []byte) (int, error) { + if (len(b) % 8) != 0 { + return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b)) + } + col.values = append(col.values, unsafecast.Slice[int64](b)...) + return len(b), nil +} + +func (col *int64ColumnBuffer) WriteInt64s(values []int64) (int, error) { + col.values = append(col.values, values...) + return len(values), nil +} + +func (col *int64ColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfU64), columnLevels{}) + return len(values), nil +} + +func (col *int64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + if n := len(col.values) + rows.Len(); n > cap(col.values) { + col.values = append(make([]int64, 0, max(n, 2*cap(col.values))), col.values...) 
+ } + n := len(col.values) + col.values = col.values[:n+rows.Len()] + sparse.GatherInt64(col.values[n:], rows.Int64Array()) +} + +func (col *int64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_int96.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_int96.go new file mode 100644 index 00000000000..17192fcc883 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_int96.go @@ -0,0 +1,108 @@ +package parquet + +import ( + "fmt" + "io" + "slices" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/sparse" +) + +type int96ColumnBuffer struct{ int96Page } + +func newInt96ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int96ColumnBuffer { + return &int96ColumnBuffer{ + int96Page: int96Page{ + typ: typ, + values: make([]deprecated.Int96, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col *int96ColumnBuffer) Clone() ColumnBuffer { + return &int96ColumnBuffer{ + int96Page: int96Page{ + typ: col.typ, + values: slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *int96ColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return int96ColumnIndex{&col.int96Page}, nil +} + +func (col *int96ColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return int96OffsetIndex{&col.int96Page}, nil +} + +func (col *int96ColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *int96ColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *int96ColumnBuffer) Pages() Pages { 
return onePage(col.Page()) } + +func (col *int96ColumnBuffer) Page() Page { return &col.int96Page } + +func (col *int96ColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *int96ColumnBuffer) Cap() int { return cap(col.values) } + +func (col *int96ColumnBuffer) Len() int { return len(col.values) } + +func (col *int96ColumnBuffer) Less(i, j int) bool { return col.values[i].Less(col.values[j]) } + +func (col *int96ColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *int96ColumnBuffer) Write(b []byte) (int, error) { + if (len(b) % 12) != 0 { + return 0, fmt.Errorf("cannot write INT96 values from input of size %d", len(b)) + } + col.values = append(col.values, unsafecast.Slice[deprecated.Int96](b)...) + return len(b), nil +} + +func (col *int96ColumnBuffer) WriteInt96s(values []deprecated.Int96) (int, error) { + col.values = append(col.values, values...) + return len(values), nil +} + +func (col *int96ColumnBuffer) WriteValues(values []Value) (int, error) { + for _, v := range values { + col.values = append(col.values, v.Int96()) + } + return len(values), nil +} + +func (col *int96ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + for i := range rows.Len() { + p := rows.Index(i) + col.values = append(col.values, *(*deprecated.Int96)(p)) + } +} + +func (col *int96ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_optional.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_optional.go new file mode 100644 index 00000000000..77bfb6c83c5 --- /dev/null +++ 
b/vendor/github.com/parquet-go/parquet-go/column_buffer_optional.go @@ -0,0 +1,281 @@ +package parquet + +import ( + "io" + "slices" + + "github.com/parquet-go/parquet-go/sparse" +) + +// optionalColumnBuffer is an implementation of the ColumnBuffer interface used +// as a wrapper to an underlying ColumnBuffer to manage the creation of +// definition levels. +// +// Null values are not written to the underlying column; instead, the buffer +// tracks offsets of row values in the column, null row values are represented +// by the value -1 and a definition level less than the max. +// +// This column buffer type is used for all leaf columns that have a non-zero +// max definition level and a zero repetition level, which may be because the +// column or one of its parent(s) are marked optional. +type optionalColumnBuffer struct { + base ColumnBuffer + reordered bool + maxDefinitionLevel byte + rows []int32 + sortIndex []int32 + definitionLevels []byte + nullOrdering nullOrdering +} + +func newOptionalColumnBuffer(base ColumnBuffer, rows []int32, levels []byte, maxDefinitionLevel byte, nullOrdering nullOrdering) *optionalColumnBuffer { + return &optionalColumnBuffer{ + base: base, + rows: rows, + maxDefinitionLevel: maxDefinitionLevel, + definitionLevels: levels, + nullOrdering: nullOrdering, + } +} + +func (col *optionalColumnBuffer) Clone() ColumnBuffer { + return &optionalColumnBuffer{ + base: col.base.Clone(), + reordered: col.reordered, + maxDefinitionLevel: col.maxDefinitionLevel, + rows: slices.Clone(col.rows), + definitionLevels: slices.Clone(col.definitionLevels), + nullOrdering: col.nullOrdering, + } +} + +func (col *optionalColumnBuffer) Type() Type { + return col.base.Type() +} + +func (col *optionalColumnBuffer) NumValues() int64 { + return int64(len(col.definitionLevels)) +} + +func (col *optionalColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels) +} + +func (col 
*optionalColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return col.base.OffsetIndex() +} + +func (col *optionalColumnBuffer) BloomFilter() BloomFilter { + return col.base.BloomFilter() +} + +func (col *optionalColumnBuffer) Dictionary() Dictionary { + return col.base.Dictionary() +} + +func (col *optionalColumnBuffer) Column() int { + return col.base.Column() +} + +func (col *optionalColumnBuffer) Pages() Pages { + return onePage(col.Page()) +} + +func (col *optionalColumnBuffer) Page() Page { + // No need for any cyclic sorting if the rows have not been reordered. + // This case is also important because the cyclic sorting modifies the + // buffer which makes it unsafe to read the buffer concurrently. + if col.reordered { + numNulls := countLevelsNotEqual(col.definitionLevels, col.maxDefinitionLevel) + numValues := len(col.rows) - numNulls + + if numValues > 0 { + if cap(col.sortIndex) < numValues { + col.sortIndex = make([]int32, numValues) + } + sortIndex := col.sortIndex[:numValues] + i := 0 + for _, j := range col.rows { + if j >= 0 { + sortIndex[j] = int32(i) + i++ + } + } + + // Cyclic sort: O(N) + for i := range sortIndex { + for j := int(sortIndex[i]); i != j; j = int(sortIndex[i]) { + col.base.Swap(i, j) + sortIndex[i], sortIndex[j] = sortIndex[j], sortIndex[i] + } + } + } + + i := 0 + for _, r := range col.rows { + if r >= 0 { + col.rows[i] = int32(i) + i++ + } + } + + col.reordered = false + } + + return newOptionalPage(col.base.Page(), col.maxDefinitionLevel, col.definitionLevels) +} + +func (col *optionalColumnBuffer) Reset() { + col.base.Reset() + col.rows = col.rows[:0] + col.definitionLevels = col.definitionLevels[:0] +} + +func (col *optionalColumnBuffer) Size() int64 { + return int64(4*len(col.rows)+4*len(col.sortIndex)+len(col.definitionLevels)) + col.base.Size() +} + +func (col *optionalColumnBuffer) Cap() int { return cap(col.rows) } + +func (col *optionalColumnBuffer) Len() int { return len(col.rows) } + +func (col 
*optionalColumnBuffer) Less(i, j int) bool { + return col.nullOrdering( + col.base, + int(col.rows[i]), + int(col.rows[j]), + col.maxDefinitionLevel, + col.definitionLevels[i], + col.definitionLevels[j], + ) +} + +func (col *optionalColumnBuffer) Swap(i, j int) { + // Because the underlying column does not contain null values, we cannot + // swap its values at indexes i and j. We swap the row indexes only, then + // reorder the underlying buffer using a cyclic sort when the buffer is + // materialized into a page view. + col.reordered = true + col.rows[i], col.rows[j] = col.rows[j], col.rows[i] + col.definitionLevels[i], col.definitionLevels[j] = col.definitionLevels[j], col.definitionLevels[i] +} + +func (col *optionalColumnBuffer) WriteValues(values []Value) (n int, err error) { + rowIndex := int32(col.base.Len()) + + for n < len(values) { + // Collect index range of contiguous null values, from i to n. If this + // for loop exhausts the values, all remaining if statements and for + // loops will be no-ops and the loop will terminate. + i := n + for n < len(values) && values[n].definitionLevel != col.maxDefinitionLevel { + n++ + } + + // Write the contiguous null values up until the first non-null value + // obtained in the for loop above. + for _, v := range values[i:n] { + col.rows = append(col.rows, -1) + col.definitionLevels = append(col.definitionLevels, v.definitionLevel) + } + + // Collect index range of contiguous non-null values, from i to n. + i = n + for n < len(values) && values[n].definitionLevel == col.maxDefinitionLevel { + n++ + } + + // As long as i < n we have non-null values still to write. It is + // possible that we just exhausted the input values in which case i == n + // and the outer for loop will terminate. 
+ if i < n { + count, err := col.base.WriteValues(values[i:n]) + col.definitionLevels = appendLevel(col.definitionLevels, col.maxDefinitionLevel, count) + + for count > 0 { + col.rows = append(col.rows, rowIndex) + rowIndex++ + count-- + } + + if err != nil { + return n, err + } + } + } + return n, nil +} + +func (col *optionalColumnBuffer) writeValues(rows sparse.Array, levels columnLevels) { + // The row count is zero when writing an null optional value, in which case + // we still need to output a row to the buffer to record the definition + // level. + if rows.Len() == 0 { + col.definitionLevels = append(col.definitionLevels, levels.definitionLevel) + col.rows = append(col.rows, -1) + return + } + + col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, rows.Len()) + + i := len(col.rows) + j := len(col.rows) + rows.Len() + + if j <= cap(col.rows) { + col.rows = col.rows[:j] + } else { + tmp := make([]int32, j, 2*j) + copy(tmp, col.rows) + col.rows = tmp + } + + if levels.definitionLevel != col.maxDefinitionLevel { + broadcastValueInt32(col.rows[i:], -1) + } else { + broadcastRangeInt32(col.rows[i:], int32(col.base.Len())) + col.base.writeValues(rows, levels) + } +} + +func (col *optionalColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) { + length := int64(len(col.definitionLevels)) + if offset < 0 { + return 0, errRowIndexOutOfBounds(offset, length) + } + if offset >= length { + return 0, io.EOF + } + if length -= offset; length < int64(len(values)) { + values = values[:length] + } + + numNulls1 := int64(countLevelsNotEqual(col.definitionLevels[:offset], col.maxDefinitionLevel)) + numNulls2 := int64(countLevelsNotEqual(col.definitionLevels[offset:offset+length], col.maxDefinitionLevel)) + + if numNulls2 < length { + n, err := col.base.ReadValuesAt(values[:length-numNulls2], offset-numNulls1) + if err != nil { + return n, err + } + } + + if numNulls2 > 0 { + columnIndex := ^int16(col.Column()) + i := numNulls2 - 1 + 
j := length - 1 + definitionLevels := col.definitionLevels[offset : offset+length] + maxDefinitionLevel := col.maxDefinitionLevel + + for n := len(definitionLevels) - 1; n >= 0 && j > i; n-- { + if definitionLevels[n] != maxDefinitionLevel { + values[j] = Value{definitionLevel: definitionLevels[n], columnIndex: columnIndex} + } else { + values[j] = values[i] + i-- + } + j-- + } + } + + return int(length), nil +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_reflect.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_reflect.go new file mode 100644 index 00000000000..03f61dc51c7 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_reflect.go @@ -0,0 +1,134 @@ +package parquet + +import ( + "cmp" + "math/bits" + "reflect" + "sort" + "unsafe" + + "github.com/parquet-go/parquet-go/sparse" +) + +type anymap interface { + entries() (keys, values sparse.Array) +} + +type gomap[K cmp.Ordered] struct { + keys []K + vals reflect.Value // slice + swap func(int, int) + size uintptr +} + +func (m *gomap[K]) Len() int { return len(m.keys) } + +func (m *gomap[K]) Less(i, j int) bool { return cmp.Compare(m.keys[i], m.keys[j]) < 0 } + +func (m *gomap[K]) Swap(i, j int) { + m.keys[i], m.keys[j] = m.keys[j], m.keys[i] + m.swap(i, j) +} + +func (m *gomap[K]) entries() (keys, values sparse.Array) { + return makeArrayOf(m.keys), makeArray(m.vals.UnsafePointer(), m.Len(), m.size) +} + +type reflectMap struct { + keys reflect.Value // slice + vals reflect.Value // slice + numKeys int + keySize uintptr + valSize uintptr +} + +func (m *reflectMap) entries() (keys, values sparse.Array) { + return makeArray(m.keys.UnsafePointer(), m.numKeys, m.keySize), makeArray(m.vals.UnsafePointer(), m.numKeys, m.valSize) +} + +func makeMapFuncOf(mapType reflect.Type) func(reflect.Value) anymap { + switch mapType.Key().Kind() { + case reflect.Int: + return makeMapFunc[int](mapType) + case reflect.Int8: + return makeMapFunc[int8](mapType) + case 
reflect.Int16: + return makeMapFunc[int16](mapType) + case reflect.Int32: + return makeMapFunc[int32](mapType) + case reflect.Int64: + return makeMapFunc[int64](mapType) + case reflect.Uint: + return makeMapFunc[uint](mapType) + case reflect.Uint8: + return makeMapFunc[uint8](mapType) + case reflect.Uint16: + return makeMapFunc[uint16](mapType) + case reflect.Uint32: + return makeMapFunc[uint32](mapType) + case reflect.Uint64: + return makeMapFunc[uint64](mapType) + case reflect.Uintptr: + return makeMapFunc[uintptr](mapType) + case reflect.Float32: + return makeMapFunc[float32](mapType) + case reflect.Float64: + return makeMapFunc[float64](mapType) + case reflect.String: + return makeMapFunc[string](mapType) + } + + keyType := mapType.Key() + valType := mapType.Elem() + + mapBuffer := &reflectMap{ + keySize: keyType.Size(), + valSize: valType.Size(), + } + + keySliceType := reflect.SliceOf(keyType) + valSliceType := reflect.SliceOf(valType) + return func(mapValue reflect.Value) anymap { + length := mapValue.Len() + + if !mapBuffer.keys.IsValid() || mapBuffer.keys.Len() < length { + capacity := 1 << bits.Len(uint(length)) + mapBuffer.keys = reflect.MakeSlice(keySliceType, capacity, capacity) + mapBuffer.vals = reflect.MakeSlice(valSliceType, capacity, capacity) + } + + mapBuffer.numKeys = length + for i, mapIter := 0, mapValue.MapRange(); mapIter.Next(); i++ { + mapBuffer.keys.Index(i).SetIterKey(mapIter) + mapBuffer.vals.Index(i).SetIterValue(mapIter) + } + + return mapBuffer + } +} + +func makeMapFunc[K cmp.Ordered](mapType reflect.Type) func(reflect.Value) anymap { + keyType := mapType.Key() + valType := mapType.Elem() + valSliceType := reflect.SliceOf(valType) + mapBuffer := &gomap[K]{size: valType.Size()} + return func(mapValue reflect.Value) anymap { + length := mapValue.Len() + + if cap(mapBuffer.keys) < length { + capacity := 1 << bits.Len(uint(length)) + mapBuffer.keys = make([]K, capacity) + mapBuffer.vals = reflect.MakeSlice(valSliceType, capacity, 
capacity) + mapBuffer.swap = reflect.Swapper(mapBuffer.vals.Interface()) + } + + mapBuffer.keys = mapBuffer.keys[:length] + for i, mapIter := 0, mapValue.MapRange(); mapIter.Next(); i++ { + reflect.NewAt(keyType, unsafe.Pointer(&mapBuffer.keys[i])).Elem().SetIterKey(mapIter) + mapBuffer.vals.Index(i).SetIterValue(mapIter) + } + + sort.Sort(mapBuffer) + return mapBuffer + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_repeated.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_repeated.go new file mode 100644 index 00000000000..5b68b4b243f --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_repeated.go @@ -0,0 +1,322 @@ +package parquet + +import ( + "bytes" + "slices" + + "github.com/parquet-go/parquet-go/sparse" +) + +// repeatedColumnBuffer is an implementation of the ColumnBuffer interface used +// as a wrapper to an underlying ColumnBuffer to manage the creation of +// repetition levels, definition levels, and map rows to the region of the +// underlying buffer that contains their sequence of values. +// +// Null values are not written to the underlying column; instead, the buffer +// tracks offsets of row values in the column, null row values are represented +// by the value -1 and a definition level less than the max. +// +// This column buffer type is used for all leaf columns that have a non-zero +// max repetition level, which may be because the column or one of its parent(s) +// are marked repeated. +type repeatedColumnBuffer struct { + base ColumnBuffer + reordered bool + maxRepetitionLevel byte + maxDefinitionLevel byte + rows []offsetMapping + repetitionLevels []byte + definitionLevels []byte + buffer []Value + reordering *repeatedColumnBuffer + nullOrdering nullOrdering +} + +// The offsetMapping type maps the logical offset of rows within the repetition +// and definition levels, to the base offsets in the underlying column buffers +// where the non-null values have been written. 
+type offsetMapping struct { + offset uint32 + baseOffset uint32 +} + +func newRepeatedColumnBuffer(base ColumnBuffer, repetitionLevels, definitionLevels []byte, maxRepetitionLevel, maxDefinitionLevel byte, nullOrdering nullOrdering) *repeatedColumnBuffer { + n := base.Cap() + return &repeatedColumnBuffer{ + base: base, + maxRepetitionLevel: maxRepetitionLevel, + maxDefinitionLevel: maxDefinitionLevel, + rows: make([]offsetMapping, 0, n/8), + repetitionLevels: repetitionLevels, + definitionLevels: definitionLevels, + nullOrdering: nullOrdering, + } +} + +func (col *repeatedColumnBuffer) Clone() ColumnBuffer { + return &repeatedColumnBuffer{ + base: col.base.Clone(), + reordered: col.reordered, + maxRepetitionLevel: col.maxRepetitionLevel, + maxDefinitionLevel: col.maxDefinitionLevel, + rows: slices.Clone(col.rows), + repetitionLevels: slices.Clone(col.repetitionLevels), + definitionLevels: slices.Clone(col.definitionLevels), + nullOrdering: col.nullOrdering, + } +} + +func (col *repeatedColumnBuffer) Type() Type { + return col.base.Type() +} + +func (col *repeatedColumnBuffer) NumValues() int64 { + return int64(len(col.definitionLevels)) +} + +func (col *repeatedColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels) +} + +func (col *repeatedColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return col.base.OffsetIndex() +} + +func (col *repeatedColumnBuffer) BloomFilter() BloomFilter { + return col.base.BloomFilter() +} + +func (col *repeatedColumnBuffer) Dictionary() Dictionary { + return col.base.Dictionary() +} + +func (col *repeatedColumnBuffer) Column() int { + return col.base.Column() +} + +func (col *repeatedColumnBuffer) Pages() Pages { + return onePage(col.Page()) +} + +func (col *repeatedColumnBuffer) Page() Page { + if col.reordered { + if col.reordering == nil { + col.reordering = col.Clone().(*repeatedColumnBuffer) + } + + column := col.reordering + column.Reset() + 
maxNumValues := 0 + defer func() { + clearValues(col.buffer[:maxNumValues]) + }() + + baseOffset := 0 + + for _, row := range col.rows { + rowOffset := int(row.offset) + rowLength := repeatedRowLength(col.repetitionLevels[rowOffset:]) + numNulls := countLevelsNotEqual(col.definitionLevels[rowOffset:rowOffset+rowLength], col.maxDefinitionLevel) + numValues := rowLength - numNulls + + if numValues > 0 { + if numValues > cap(col.buffer) { + col.buffer = make([]Value, numValues) + } else { + col.buffer = col.buffer[:numValues] + } + n, err := col.base.ReadValuesAt(col.buffer, int64(row.baseOffset)) + if err != nil && n < numValues { + return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err) + } + if _, err := column.base.WriteValues(col.buffer); err != nil { + return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err) + } + if numValues > maxNumValues { + maxNumValues = numValues + } + } + + column.rows = append(column.rows, offsetMapping{ + offset: uint32(len(column.repetitionLevels)), + baseOffset: uint32(baseOffset), + }) + + column.repetitionLevels = append(column.repetitionLevels, col.repetitionLevels[rowOffset:rowOffset+rowLength]...) + column.definitionLevels = append(column.definitionLevels, col.definitionLevels[rowOffset:rowOffset+rowLength]...) 
+ baseOffset += numValues + } + + col.swapReorderingBuffer(column) + col.reordered = false + } + + return newRepeatedPage( + col.base.Page(), + col.maxRepetitionLevel, + col.maxDefinitionLevel, + col.repetitionLevels, + col.definitionLevels, + ) +} + +func (col *repeatedColumnBuffer) swapReorderingBuffer(buf *repeatedColumnBuffer) { + col.base, buf.base = buf.base, col.base + col.rows, buf.rows = buf.rows, col.rows + col.repetitionLevels, buf.repetitionLevels = buf.repetitionLevels, col.repetitionLevels + col.definitionLevels, buf.definitionLevels = buf.definitionLevels, col.definitionLevels +} + +func (col *repeatedColumnBuffer) Reset() { + col.base.Reset() + col.rows = col.rows[:0] + col.repetitionLevels = col.repetitionLevels[:0] + col.definitionLevels = col.definitionLevels[:0] +} + +func (col *repeatedColumnBuffer) Size() int64 { + return int64(8*len(col.rows)+len(col.repetitionLevels)+len(col.definitionLevels)) + col.base.Size() +} + +func (col *repeatedColumnBuffer) Cap() int { return cap(col.rows) } + +func (col *repeatedColumnBuffer) Len() int { return len(col.rows) } + +func (col *repeatedColumnBuffer) Less(i, j int) bool { + row1 := col.rows[i] + row2 := col.rows[j] + less := col.nullOrdering + row1Length := repeatedRowLength(col.repetitionLevels[row1.offset:]) + row2Length := repeatedRowLength(col.repetitionLevels[row2.offset:]) + + for k := 0; k < row1Length && k < row2Length; k++ { + x := int(row1.baseOffset) + y := int(row2.baseOffset) + definitionLevel1 := col.definitionLevels[int(row1.offset)+k] + definitionLevel2 := col.definitionLevels[int(row2.offset)+k] + switch { + case less(col.base, x, y, col.maxDefinitionLevel, definitionLevel1, definitionLevel2): + return true + case less(col.base, y, x, col.maxDefinitionLevel, definitionLevel2, definitionLevel1): + return false + } + } + + return row1Length < row2Length +} + +func (col *repeatedColumnBuffer) Swap(i, j int) { + // Because the underlying column does not contain null values, and may hold + 
// an arbitrary number of values per row, we cannot swap its values at + // indexes i and j. We swap the row indexes only, then reorder the base + // column buffer when its view is materialized into a page by creating a + // copy and writing rows back to it following the order of rows in the + // repeated column buffer. + col.reordered = true + col.rows[i], col.rows[j] = col.rows[j], col.rows[i] +} + +func (col *repeatedColumnBuffer) WriteValues(values []Value) (numValues int, err error) { + maxRowLen := 0 + defer func() { + clearValues(col.buffer[:maxRowLen]) + }() + + for i := 0; i < len(values); { + j := i + + if values[j].repetitionLevel == 0 { + j++ + } + + for j < len(values) && values[j].repetitionLevel != 0 { + j++ + } + + if err := col.writeRow(values[i:j]); err != nil { + return numValues, err + } + + if len(col.buffer) > maxRowLen { + maxRowLen = len(col.buffer) + } + + numValues += j - i + i = j + } + + return numValues, nil +} + +func (col *repeatedColumnBuffer) writeRow(row []Value) error { + col.buffer = col.buffer[:0] + + for _, v := range row { + if v.definitionLevel == col.maxDefinitionLevel { + col.buffer = append(col.buffer, v) + } + } + + baseOffset := col.base.NumValues() + if len(col.buffer) > 0 { + if _, err := col.base.WriteValues(col.buffer); err != nil { + return err + } + } + + if row[0].repetitionLevel == 0 { + col.rows = append(col.rows, offsetMapping{ + offset: uint32(len(col.repetitionLevels)), + baseOffset: uint32(baseOffset), + }) + } + + for _, v := range row { + col.repetitionLevels = append(col.repetitionLevels, v.repetitionLevel) + col.definitionLevels = append(col.definitionLevels, v.definitionLevel) + } + + return nil +} + +func (col *repeatedColumnBuffer) writeValues(row sparse.Array, levels columnLevels) { + if levels.repetitionLevel == 0 { + col.rows = append(col.rows, offsetMapping{ + offset: uint32(len(col.repetitionLevels)), + baseOffset: uint32(col.base.NumValues()), + }) + } + + if row.Len() == 0 { + 
col.repetitionLevels = append(col.repetitionLevels, levels.repetitionLevel) + col.definitionLevels = append(col.definitionLevels, levels.definitionLevel) + return + } + + col.repetitionLevels = appendLevel(col.repetitionLevels, levels.repetitionLevel, row.Len()) + col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, row.Len()) + + if levels.definitionLevel == col.maxDefinitionLevel { + col.base.writeValues(row, levels) + } +} + +func (col *repeatedColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) { + // TODO: + panic("NOT IMPLEMENTED") +} + +// repeatedRowLength gives the length of the repeated row starting at the +// beginning of the repetitionLevels slice. +func repeatedRowLength(repetitionLevels []byte) int { + // If a repetition level exists, at least one value is required to represent + // the column. + if len(repetitionLevels) > 0 { + // The subsequent levels will represent the start of a new record when + // they go back to zero. + if i := bytes.IndexByte(repetitionLevels[1:], 0); i >= 0 { + return i + 1 + } + } + return len(repetitionLevels) +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_uint32.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_uint32.go new file mode 100644 index 00000000000..a499a1ef762 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_uint32.go @@ -0,0 +1,107 @@ +package parquet + +import ( + "fmt" + "io" + "slices" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/sparse" +) + +type uint32ColumnBuffer struct{ uint32Page } + +func newUint32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint32ColumnBuffer { + return &uint32ColumnBuffer{ + uint32Page: uint32Page{ + typ: typ, + values: make([]uint32, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col *uint32ColumnBuffer) Clone() ColumnBuffer { + return &uint32ColumnBuffer{ + uint32Page: uint32Page{ + typ: col.typ, + values: 
slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *uint32ColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return uint32ColumnIndex{&col.uint32Page}, nil +} + +func (col *uint32ColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return uint32OffsetIndex{&col.uint32Page}, nil +} + +func (col *uint32ColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *uint32ColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *uint32ColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func (col *uint32ColumnBuffer) Page() Page { return &col.uint32Page } + +func (col *uint32ColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *uint32ColumnBuffer) Cap() int { return cap(col.values) } + +func (col *uint32ColumnBuffer) Len() int { return len(col.values) } + +func (col *uint32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } + +func (col *uint32ColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *uint32ColumnBuffer) Write(b []byte) (int, error) { + if (len(b) % 4) != 0 { + return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b)) + } + col.values = append(col.values, unsafecast.Slice[uint32](b)...) + return len(b), nil +} + +func (col *uint32ColumnBuffer) WriteUint32s(values []uint32) (int, error) { + col.values = append(col.values, values...) + return len(values), nil +} + +func (col *uint32ColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfU32), columnLevels{}) + return len(values), nil +} + +func (col *uint32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + if n := len(col.values) + rows.Len(); n > cap(col.values) { + col.values = append(make([]uint32, 0, max(n, 2*cap(col.values))), col.values...) 
+ } + n := len(col.values) + col.values = col.values[:n+rows.Len()] + sparse.GatherUint32(col.values[n:], rows.Uint32Array()) +} + +func (col *uint32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_uint64.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_uint64.go new file mode 100644 index 00000000000..a75b0e91ef1 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_uint64.go @@ -0,0 +1,107 @@ +package parquet + +import ( + "fmt" + "io" + "slices" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/sparse" +) + +type uint64ColumnBuffer struct{ uint64Page } + +func newUint64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint64ColumnBuffer { + return &uint64ColumnBuffer{ + uint64Page: uint64Page{ + typ: typ, + values: make([]uint64, 0, numValues), + columnIndex: ^columnIndex, + }, + } +} + +func (col *uint64ColumnBuffer) Clone() ColumnBuffer { + return &uint64ColumnBuffer{ + uint64Page: uint64Page{ + typ: col.typ, + values: slices.Clone(col.values), + columnIndex: col.columnIndex, + }, + } +} + +func (col *uint64ColumnBuffer) ColumnIndex() (ColumnIndex, error) { + return uint64ColumnIndex{&col.uint64Page}, nil +} + +func (col *uint64ColumnBuffer) OffsetIndex() (OffsetIndex, error) { + return uint64OffsetIndex{&col.uint64Page}, nil +} + +func (col *uint64ColumnBuffer) BloomFilter() BloomFilter { return nil } + +func (col *uint64ColumnBuffer) Dictionary() Dictionary { return nil } + +func (col *uint64ColumnBuffer) Pages() Pages { return onePage(col.Page()) } + +func 
(col *uint64ColumnBuffer) Page() Page { return &col.uint64Page } + +func (col *uint64ColumnBuffer) Reset() { col.values = col.values[:0] } + +func (col *uint64ColumnBuffer) Cap() int { return cap(col.values) } + +func (col *uint64ColumnBuffer) Len() int { return len(col.values) } + +func (col *uint64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] } + +func (col *uint64ColumnBuffer) Swap(i, j int) { + col.values[i], col.values[j] = col.values[j], col.values[i] +} + +func (col *uint64ColumnBuffer) Write(b []byte) (int, error) { + if (len(b) % 8) != 0 { + return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b)) + } + col.values = append(col.values, unsafecast.Slice[uint64](b)...) + return len(b), nil +} + +func (col *uint64ColumnBuffer) WriteUint64s(values []uint64) (int, error) { + col.values = append(col.values, values...) + return len(values), nil +} + +func (col *uint64ColumnBuffer) WriteValues(values []Value) (int, error) { + col.writeValues(makeArrayValue(values, offsetOfU64), columnLevels{}) + return len(values), nil +} + +func (col *uint64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) { + if n := len(col.values) + rows.Len(); n > cap(col.values) { + col.values = append(make([]uint64, 0, max(n, 2*cap(col.values))), col.values...) 
+ } + n := len(col.values) + col.values = col.values[:n+rows.Len()] + sparse.GatherUint64(col.values[n:], rows.Uint64Array()) +} + +func (col *uint64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) { + i := int(offset) + switch { + case i < 0: + return 0, errRowIndexOutOfBounds(offset, int64(len(col.values))) + case i >= len(col.values): + return 0, io.EOF + default: + for n < len(values) && i < len(col.values) { + values[n] = col.makeValue(col.values[i]) + n++ + i++ + } + if n < len(values) { + err = io.EOF + } + return n, err + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_buffer_write.go b/vendor/github.com/parquet-go/parquet-go/column_buffer_write.go new file mode 100644 index 00000000000..e94daedee19 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/column_buffer_write.go @@ -0,0 +1,834 @@ +package parquet + +import ( + "encoding/json" + "fmt" + "hash/maphash" + "math/bits" + "reflect" + "slices" + "sync" + "time" + "unsafe" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/sparse" +) + +// writeRowsFunc is the type of functions that apply rows to a set of column +// buffers. +// +// - columns is the array of column buffer where the rows are written. +// +// - rows is the array of Go values to write to the column buffers. +// +// - levels is used to track the column index, repetition and definition levels +// of values when writing optional or repeated columns. +type writeRowsFunc func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error + +// writeRowsFuncOf generates a writeRowsFunc function for the given Go type and +// parquet schema. The column path indicates the column that the function is +// being generated for in the parquet schema. 
+func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath, tagReplacements []StructTagOption) writeRowsFunc { + if leaf, exists := schema.Lookup(path...); exists && leaf.Node.Type().LogicalType() != nil && leaf.Node.Type().LogicalType().Json != nil { + return writeRowsFuncOfJSON(t, schema, path) + } + + switch t { + case reflect.TypeOf(deprecated.Int96{}): + return writeRowsFuncOfRequired(t, schema, path) + case reflect.TypeOf(time.Time{}): + return writeRowsFuncOfTime(t, schema, path, tagReplacements) + } + + switch t.Kind() { + case reflect.Bool, + reflect.Int, + reflect.Uint, + reflect.Int32, + reflect.Uint32, + reflect.Int64, + reflect.Uint64, + reflect.Float32, + reflect.Float64, + reflect.String: + return writeRowsFuncOfRequired(t, schema, path) + + case reflect.Slice: + if t.Elem().Kind() == reflect.Uint8 { + return writeRowsFuncOfRequired(t, schema, path) + } else { + return writeRowsFuncOfSlice(t, schema, path, tagReplacements) + } + + case reflect.Array: + if t.Elem().Kind() == reflect.Uint8 { + return writeRowsFuncOfArray(t, schema, path) + } + + case reflect.Pointer: + return writeRowsFuncOfPointer(t, schema, path, tagReplacements) + + case reflect.Struct: + return writeRowsFuncOfStruct(t, schema, path, tagReplacements) + + case reflect.Map: + return writeRowsFuncOfMap(t, schema, path, tagReplacements) + } + + panic("cannot convert Go values of type " + typeNameOf(t) + " to parquet value") +} + +func writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { + column := schema.lazyLoadState().mapping.lookup(path) + columnIndex := column.columnIndex + if columnIndex < 0 { + panic("parquet: column not found: " + path.String()) + } + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + columns[columnIndex].writeValues(rows, levels) + return nil + } +} + +func writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, writeRows writeRowsFunc) writeRowsFunc { + if 
t.Kind() == reflect.Slice && t.Elem().Kind() != reflect.Uint8 { // assume nested list; []byte is scalar + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + levels.definitionLevel++ + return writeRows(columns, rows, levels) + } + } + nullIndex := nullIndexFuncOf(t) + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + + nulls := acquireBitmap(rows.Len()) + defer releaseBitmap(nulls) + nullIndex(nulls.bits, rows) + + nullLevels := levels + levels.definitionLevel++ + // In this function, we are dealing with optional values which are + // neither pointers nor slices; for example, a int32 field marked + // "optional" in its parent struct. + // + // We need to find zero values, which should be represented as nulls + // in the parquet column. In order to minimize the calls to writeRows + // and maximize throughput, we use the nullIndex and nonNullIndex + // functions, which are type-specific implementations of the algorithm. + // + // Sections of the input that are contiguous nulls or non-nulls can be + // sent to a single call to writeRows to be written to the underlying + // buffer since they share the same definition level. + // + // This optimization is defeated by inputs alternating null and non-null + // sequences of single values, we do not expect this condition to be a + // common case. 
+ for i := 0; i < rows.Len(); { + j := 0 + x := i / 64 + y := i % 64 + + if y != 0 { + if b := nulls.bits[x] >> uint(y); b == 0 { + x++ + y = 0 + } else { + y += bits.TrailingZeros64(b) + goto writeNulls + } + } + + for x < len(nulls.bits) && nulls.bits[x] == 0 { + x++ + } + + if x < len(nulls.bits) { + y = bits.TrailingZeros64(nulls.bits[x]) % 64 + } + + writeNulls: + if j = x*64 + y; j > rows.Len() { + j = rows.Len() + } + + if i < j { + if err := writeRows(columns, rows.Slice(i, j), nullLevels); err != nil { + return err + } + i = j + } + + if y != 0 { + if b := nulls.bits[x] >> uint(y); b == (1< rows.Len() { + j = rows.Len() + } + + if i < j { + if err := writeRows(columns, rows.Slice(i, j), levels); err != nil { + return err + } + i = j + } + } + + return nil + } +} + +func writeRowsFuncOfArray(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { + column := schema.lazyLoadState().mapping.lookup(path) + arrayLen := t.Len() + columnLen := column.node.Type().Length() + if arrayLen != columnLen { + panic(fmt.Sprintf("cannot convert Go values of type "+typeNameOf(t)+" to FIXED_LEN_BYTE_ARRAY(%d)", columnLen)) + } + return writeRowsFuncOfRequired(t, schema, path) +} + +func writeRowsFuncOfPointer(t reflect.Type, schema *Schema, path columnPath, tagReplacements []StructTagOption) writeRowsFunc { + elemType := t.Elem() + elemSize := uintptr(elemType.Size()) + writeRows := writeRowsFuncOf(elemType, schema, path, tagReplacements) + + if len(path) == 0 { + // This code path is taken when generating a writeRowsFunc for a pointer + // type. In this case, we do not need to increase the definition level + // since we are not deailng with an optional field but a pointer to the + // row type. 
+ return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + + for i := range rows.Len() { + p := *(*unsafe.Pointer)(rows.Index(i)) + a := sparse.Array{} + if p != nil { + a = makeArray(p, 1, elemSize) + } + if err := writeRows(columns, a, levels); err != nil { + return err + } + } + + return nil + } + } + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + + for i := range rows.Len() { + p := *(*unsafe.Pointer)(rows.Index(i)) + a := sparse.Array{} + elemLevels := levels + if p != nil { + a = makeArray(p, 1, elemSize) + elemLevels.definitionLevel++ + } + if err := writeRows(columns, a, elemLevels); err != nil { + return err + } + } + + return nil + } +} + +func writeRowsFuncOfSlice(t reflect.Type, schema *Schema, path columnPath, tagReplacements []StructTagOption) writeRowsFunc { + elemType := t.Elem() + elemSize := uintptr(elemType.Size()) + writeRows := writeRowsFuncOf(elemType, schema, path, tagReplacements) + + // When the element is a pointer type, the writeRows function will be an + // instance returned by writeRowsFuncOfPointer, which handles incrementing + // the definition level if the pointer value is not nil. 
+ definitionLevelIncrement := byte(0) + if elemType.Kind() != reflect.Ptr { + definitionLevelIncrement = 1 + } + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + + levels.repetitionDepth++ + + for i := range rows.Len() { + p := (*sliceHeader)(rows.Index(i)) + a := makeArray(p.base, p.len, elemSize) + b := sparse.Array{} + + elemLevels := levels + if a.Len() > 0 { + b = a.Slice(0, 1) + elemLevels.definitionLevel += definitionLevelIncrement + } + + if err := writeRows(columns, b, elemLevels); err != nil { + return err + } + + if a.Len() > 1 { + elemLevels.repetitionLevel = elemLevels.repetitionDepth + + if err := writeRows(columns, a.Slice(1, a.Len()), elemLevels); err != nil { + return err + } + } + } + + return nil + } +} + +func writeRowsFuncOfStruct(t reflect.Type, schema *Schema, path columnPath, tagReplacements []StructTagOption) writeRowsFunc { + type column struct { + offset uintptr + writeRows writeRowsFunc + } + + fields := structFieldsOf(path, t, tagReplacements) + columns := make([]column, len(fields)) + + for i, f := range fields { + list, optional := false, false + columnPath := path.append(f.Name) + forEachStructTagOption(f, func(_ reflect.Type, option, _ string) { + switch option { + case "list": + list = true + columnPath = columnPath.append("list", "element") + case "optional": + optional = true + } + }) + + writeRows := writeRowsFuncOf(f.Type, schema, columnPath, tagReplacements) + if optional { + kind := f.Type.Kind() + switch { + case kind == reflect.Pointer: + case kind == reflect.Slice && !list && f.Type.Elem().Kind() != reflect.Uint8: + // For slices other than []byte, optional applies to the element, not the list + case f.Type == reflect.TypeOf(time.Time{}): + // time.Time is a struct but has IsZero() method, so it needs special handling + // Don't use writeRowsFuncOfOptional which relies on bitmap batching + default: + writeRows = 
writeRowsFuncOfOptional(f.Type, schema, columnPath, writeRows) + } + } + + columns[i] = column{ + offset: f.Offset, + writeRows: writeRows, + } + } + + return func(buffers []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + for _, column := range columns { + if err := column.writeRows(buffers, rows, levels); err != nil { + return err + } + } + } else { + for _, column := range columns { + if err := column.writeRows(buffers, rows.Offset(column.offset), levels); err != nil { + return err + } + } + } + return nil + } +} + +var ( + mapStringStringType = reflect.TypeOf((map[string]string)(nil)) + mapStringAnyType = reflect.TypeOf((map[string]any)(nil)) +) + +// writeRowsFuncOfMapToGroup handles writing a Go map to a Parquet GROUP schema +// (as opposed to a MAP logical type). This allows map[string]T to be written +// to schemas with named optional fields. +func writeRowsFuncOfMapToGroup(t reflect.Type, schema *Schema, path columnPath, groupNode Node, tagReplacements []StructTagOption) writeRowsFunc { + if t.Key().Kind() != reflect.String { + panic("map keys must be strings when writing to GROUP schema") + } + + type fieldWriter struct { + fieldName string + fieldPath columnPath + writeRows writeRowsFunc // Writes null/empty value + writeValue func([]ColumnBuffer, reflect.Value, columnLevels) error + } + + // Get all fields from the GROUP and create write functions for each + fields := groupNode.Fields() + writers := make([]fieldWriter, len(fields)) + valueType := t.Elem() + valueSize := uintptr(valueType.Size()) + + // Check if the value type is interface{} - if so, we need runtime type handling + // We split into two separate loops to avoid branching inside the loop + if valueType.Kind() == reflect.Interface { + // Interface{} path - need runtime type handling + for i, field := range fields { + fieldPath := path.append(field.Name()) + fieldNode := findByPath(schema, fieldPath) + + // For interface{} types, create a write function 
based on the SCHEMA type + // This will be used when writing null values + writeNull := writeRowsFuncOfSchemaNode(fieldNode, schema, fieldPath, field) + + // Capture variables for the closure + writeValue := func(columns []ColumnBuffer, mapValue reflect.Value, levels columnLevels) error { + actualValue := mapValue + actualValueKind := actualValue.Kind() + if actualValueKind == reflect.Interface && !actualValue.IsNil() { + actualValue = actualValue.Elem() + actualValueKind = actualValue.Kind() + } + if !actualValue.IsValid() || (actualValueKind == reflect.Pointer && actualValue.IsNil()) { + // Nil interface or nil pointer - write null + return writeNull(columns, sparse.Array{}, levels) + } + if actualValueKind == reflect.Pointer { + actualValue = actualValue.Elem() + } + return writeInterfaceValue(columns, actualValue, field, schema, fieldPath, levels, tagReplacements) + } + + writers[i] = fieldWriter{ + fieldName: field.Name(), + fieldPath: fieldPath, + writeRows: writeNull, + writeValue: writeValue, + } + } + } else { + // Concrete type path - can pre-create write functions + for i, field := range fields { + fieldPath := path.append(field.Name()) + + // For concrete types, we can pre-create the write function + writeRows := writeRowsFuncOf(valueType, schema, fieldPath, tagReplacements) + + // Check if the field is optional + if field.Optional() { + writeRows = writeRowsFuncOfOptional(valueType, schema, fieldPath, writeRows) + } + + // Both null and value use the same function for concrete types + writeValue := func(columns []ColumnBuffer, mapValue reflect.Value, levels columnLevels) error { + valueArray := makeArray(reflectValuePointer(mapValue), 1, valueSize) + return writeRows(columns, valueArray, levels) + } + + writers[i] = fieldWriter{ + fieldName: field.Name(), + fieldPath: fieldPath, + writeRows: writeRows, + writeValue: writeValue, + } + } + } + + // We make sepcial cases for the common types to avoid paying the cost of + // reflection in calls like 
MapIndex which force the returned value to be + // allocated on the heap. + var writeMaps writeRowsFunc + switch { + case t.ConvertibleTo(mapStringStringType): + writeMaps = func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + buffer, _ := stringArrayPool.Get().(*stringArray) + if buffer == nil { + buffer = new(stringArray) + } + numRows := rows.Len() + numValues := len(writers) * numRows + buffer.values = slices.Grow(buffer.values, numValues)[:numValues] + defer stringArrayPool.Put(buffer) + + for i := range numRows { + m := *(*map[string]string)(reflect.NewAt(t, rows.Index(i)).UnsafePointer()) + + for j := range writers { + buffer.values[j*numRows+i] = m[writers[j].fieldName] + } + } + + for j := range writers { + a := sparse.MakeStringArray(buffer.values[j*numRows : (j+1)*numRows]) + if err := writers[j].writeRows(columns, a.UnsafeArray(), levels); err != nil { + return err + } + } + + return nil + } + + case t.ConvertibleTo(mapStringAnyType): + writeMaps = func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + for i := range rows.Len() { + m := *(*map[string]any)(reflect.NewAt(t, rows.Index(i)).UnsafePointer()) + + for j := range writers { + w := &writers[j] + v, ok := m[w.fieldName] + + var err error + if !ok { + err = w.writeRows(columns, sparse.Array{}, levels) + } else { + err = w.writeValue(columns, reflect.ValueOf(v), levels) + } + if err != nil { + return err + } + } + } + return nil + } + + default: + writeMaps = func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + for i := range rows.Len() { + m := reflect.NewAt(t, rows.Index(i)).Elem() + + for j := range writers { + w := &writers[j] + keyValue := reflect.ValueOf(&w.fieldName).Elem() + mapValue := m.MapIndex(keyValue) + + var err error + if !mapValue.IsValid() { + err = w.writeRows(columns, sparse.Array{}, levels) + } else { + err = w.writeValue(columns, mapValue, levels) + } + if err != nil { + return err + } + } + } + return 
nil + } + } + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + // Write empty values for all fields + for _, w := range writers { + if err := w.writeRows(columns, sparse.Array{}, levels); err != nil { + return err + } + } + return nil + } + return writeMaps(columns, rows, levels) + } +} + +type stringArray struct{ values []string } + +var stringArrayPool sync.Pool // *stringArray + +// writeRowsFuncOfSchemaNode creates a write function based on the schema node type +// rather than a Go type. This is used for interface{} values where we need to write +// nulls based on the schema structure. +func writeRowsFuncOfSchemaNode(node Node, schema *Schema, path columnPath, field Node) writeRowsFunc { + if node == nil { + panic(fmt.Sprintf("schema node not found at path: %v", path)) + } + + // Check if this is a leaf or a group + if len(node.Fields()) == 0 { + // It's a leaf node - create a simple write function + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + leaf, ok := schema.Lookup(path...) 
+ if !ok { + return fmt.Errorf("leaf not found: %v", path) + } + + // For optional fields with no data, we need to write at the parent's definition level + // For non-optional or when there's data, increment the definition level + if rows.Len() == 0 && field.Optional() { + // Write null - don't increment definition level + columns[leaf.ColumnIndex].writeValues(rows, levels) + } else if field.Optional() { + // Write value - increment definition level + levels.definitionLevel++ + columns[leaf.ColumnIndex].writeValues(rows, levels) + levels.definitionLevel-- + } else { + // Required field + columns[leaf.ColumnIndex].writeValues(rows, levels) + } + return nil + } + } + + // It's a group - recursively create write functions for all children + type childWriter struct { + writeRows writeRowsFunc + } + + fields := node.Fields() + children := make([]childWriter, len(fields)) + + for i, childField := range fields { + childPath := path.append(childField.Name()) + childNode := findByPath(schema, childPath) + children[i] = childWriter{ + writeRows: writeRowsFuncOfSchemaNode(childNode, schema, childPath, childField), + } + } + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + // For groups, we need to write to all child columns + for _, child := range children { + if err := child.writeRows(columns, rows, levels); err != nil { + return err + } + } + return nil + } +} + +// writeInterfaceValue writes an interface{} value at runtime, determining the appropriate +// write function based on the actual type. 
+func writeInterfaceValue(columns []ColumnBuffer, value reflect.Value, field Node, schema *Schema, path columnPath, levels columnLevels, tagReplacements []StructTagOption) error { + actualType := value.Type() + schemaCache := schema.lazyLoadCache() + + hash := maphash.Hash{} + hash.SetSeed(schemaCache.hashSeed) + + for _, name := range path { + hash.WriteString(name) + hash.WriteByte(0) + } + + writeRowsKey := writeRowsCacheKey{ + gotype: actualType, + column: hash.Sum64(), + } + + writeRows := schemaCache.writeRows.load(writeRowsKey, func() writeRowsFunc { + return writeRowsFuncOf(actualType, schema, path, tagReplacements) + }) + + // Handle optional fields + if field.Optional() { + // For optional fields with actual values, we need to increment definition level + levels.definitionLevel++ + defer func() { levels.definitionLevel-- }() + } + + valueArray := makeArray(reflectValuePointer(value), 1, actualType.Size()) + return writeRows(columns, valueArray, levels) +} + +func writeRowsFuncOfMap(t reflect.Type, schema *Schema, path columnPath, tagReplacements []StructTagOption) writeRowsFunc { + // Check if the schema at this path is a MAP or a GROUP. + node := findByPath(schema, path) + if node != nil && !isMap(node) { + // The schema is a GROUP (not a MAP), so we need to handle it differently. + // Instead of using key_value structure, we iterate through the GROUP's fields + // and look up corresponding map keys. 
+ return writeRowsFuncOfMapToGroup(t, schema, path, node, tagReplacements) + } + + // Standard MAP logical type handling + keyPath := path.append("key_value", "key") + keyType := t.Key() + writeKeys := writeRowsFuncOf(keyType, schema, keyPath, tagReplacements) + + valuePath := path.append("key_value", "value") + valueType := t.Elem() + writeValues := writeRowsFuncOf(valueType, schema, valuePath, tagReplacements) + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + if err := writeKeys(columns, rows, levels); err != nil { + return err + } + if err := writeValues(columns, rows, levels); err != nil { + return err + } + return nil + } + + levels.repetitionDepth++ + makeMap := makeMapFuncOf(t) + + for i := range rows.Len() { + m := reflect.NewAt(t, rows.Index(i)).Elem() + n := m.Len() + + if n == 0 { + empty := sparse.Array{} + if err := writeKeys(columns, empty, levels); err != nil { + return err + } + if err := writeValues(columns, empty, levels); err != nil { + return err + } + continue + } + + elemLevels := levels + elemLevels.definitionLevel++ + + keys, values := makeMap(m).entries() + if err := writeKeys(columns, keys.Slice(0, 1), elemLevels); err != nil { + return err + } + if err := writeValues(columns, values.Slice(0, 1), elemLevels); err != nil { + return err + } + if n > 1 { + elemLevels.repetitionLevel = elemLevels.repetitionDepth + if err := writeKeys(columns, keys.Slice(1, n), elemLevels); err != nil { + return err + } + if err := writeValues(columns, values.Slice(1, n), elemLevels); err != nil { + return err + } + } + } + + return nil + } +} + +func writeRowsFuncOfJSON(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { + // If this is a string or a byte array write directly. 
+ switch t.Kind() { + case reflect.String: + return writeRowsFuncOfRequired(t, schema, path) + case reflect.Slice: + if t.Elem().Kind() == reflect.Uint8 { + return writeRowsFuncOfRequired(t, schema, path) + } + } + + // Otherwise handle with a json.Marshal + asStrT := reflect.TypeOf(string("")) + writer := writeRowsFuncOfRequired(asStrT, schema, path) + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writer(columns, rows, levels) + } + for i := range rows.Len() { + val := reflect.NewAt(t, rows.Index(i)) + asI := val.Interface() + + b, err := json.Marshal(asI) + if err != nil { + return err + } + + asStr := string(b) + a := sparse.MakeStringArray([]string{asStr}) + if err := writer(columns, a.UnsafeArray(), levels); err != nil { + return err + } + } + return nil + } +} + +func writeRowsFuncOfTime(_ reflect.Type, schema *Schema, path columnPath, tagReplacements []StructTagOption) writeRowsFunc { + t := reflect.TypeOf(int64(0)) + elemSize := uintptr(t.Size()) + writeRows := writeRowsFuncOf(t, schema, path, tagReplacements) + + col, _ := schema.Lookup(path...) + unit := Nanosecond.TimeUnit() + lt := col.Node.Type().LogicalType() + if lt != nil && lt.Timestamp != nil { + unit = lt.Timestamp.Unit + } + + // Check if the column is optional + isOptional := col.Node.Optional() + + return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { + if rows.Len() == 0 { + return writeRows(columns, rows, levels) + } + + // If we're optional and the current definition level is already > 0, + // then we're in a pointer/nested context where writeRowsFuncOfPointer already handles optionality. + // Don't double-handle it here. For simple optional fields, definitionLevel starts at 0. 
+ alreadyHandled := isOptional && levels.definitionLevel > 0 + + times := rows.TimeArray() + for i := range times.Len() { + t := times.Index(i) + + // For optional fields, check if the value is zero (unless already handled by pointer wrapper) + elemLevels := levels + if isOptional && !alreadyHandled && t.IsZero() { + // Write as NULL (don't increment definition level) + empty := sparse.Array{} + if err := writeRows(columns, empty, elemLevels); err != nil { + return err + } + continue + } + + // For optional non-zero values, increment definition level (unless already handled) + if isOptional && !alreadyHandled { + elemLevels.definitionLevel++ + } + + var val int64 + switch { + case unit.Millis != nil: + val = t.UnixMilli() + case unit.Micros != nil: + val = t.UnixMicro() + default: + val = t.UnixNano() + } + + a := makeArray(reflectValueData(reflect.ValueOf(val)), 1, elemSize) + if err := writeRows(columns, a, elemLevels); err != nil { + return err + } + } + + return nil + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/column_chunk.go b/vendor/github.com/parquet-go/parquet-go/column_chunk.go index 56dd3568002..45e8693886b 100644 --- a/vendor/github.com/parquet-go/parquet-go/column_chunk.go +++ b/vendor/github.com/parquet-go/parquet-go/column_chunk.go @@ -78,19 +78,18 @@ type ColumnChunkValueReader interface { // NewColumnChunkValueReader creates a new ColumnChunkValueReader for the given // column chunk. 
func NewColumnChunkValueReader(column ColumnChunk) ColumnChunkValueReader { - return &columnChunkValueReader{pages: column.Pages(), release: Release} + return &columnChunkValueReader{pages: column.Pages()} } type columnChunkValueReader struct { - pages Pages - page Page - values ValueReader - release func(Page) + pages Pages + page Page + values ValueReader } func (r *columnChunkValueReader) clear() { if r.page != nil { - r.release(r.page) + Release(r.page) r.page = nil r.values = nil } @@ -238,9 +237,13 @@ func readRowsFuncOfRepeated(read readRowsFunc, repetitionDepth byte) readRowsFun func readRowsFuncOfGroup(node Node, columnIndex int, repetitionDepth byte) (int, readRowsFunc) { fields := node.Fields() + // Empty groups (groups with no fields) are valid structural elements + // that don't contain column data. This function shouldn't be called in + // practice since empty groups have no leaf columns to read from. if len(fields) == 0 { - return columnIndex, func(*rowGroupRows, []Row, byte) (int, error) { - return 0, io.EOF + return columnIndex, func(r *rowGroupRows, rows []Row, repetitionLevel byte) (int, error) { + // Return 0 since there are no columns to read + return 0, nil } } diff --git a/vendor/github.com/parquet-go/parquet-go/column_index.go b/vendor/github.com/parquet-go/parquet-go/column_index.go index 81e0c4f6cca..0ddc1842df3 100644 --- a/vendor/github.com/parquet-go/parquet-go/column_index.go +++ b/vendor/github.com/parquet-go/parquet-go/column_index.go @@ -1,10 +1,10 @@ package parquet import ( + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding/plain" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) type ColumnIndex interface { diff --git a/vendor/github.com/parquet-go/parquet-go/column_index_le.go b/vendor/github.com/parquet-go/parquet-go/column_index_le.go index 6b6ac8f30b3..ff842efbb90 100644 --- 
a/vendor/github.com/parquet-go/parquet-go/column_index_le.go +++ b/vendor/github.com/parquet-go/parquet-go/column_index_le.go @@ -5,8 +5,8 @@ package parquet import ( + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) func columnIndexInt32Values(values []int32) []byte { diff --git a/vendor/github.com/parquet-go/parquet-go/config.go b/vendor/github.com/parquet-go/parquet-go/config.go index 2a6e0bf5f6c..0c333241aa0 100644 --- a/vendor/github.com/parquet-go/parquet-go/config.go +++ b/vendor/github.com/parquet-go/parquet-go/config.go @@ -4,6 +4,7 @@ import ( "fmt" "maps" "math" + "reflect" "runtime/debug" "slices" "strings" @@ -160,13 +161,16 @@ func (c *FileConfig) Validate() error { // // ... // }) type ReaderConfig struct { - Schema *Schema + Schema *Schema + SchemaConfig *SchemaConfig } // DefaultReaderConfig returns a new ReaderConfig value initialized with the // default reader configuration. func DefaultReaderConfig() *ReaderConfig { - return &ReaderConfig{} + return &ReaderConfig{ + SchemaConfig: DefaultSchemaConfig(), + } } // NewReaderConfig constructs a new reader configuration applying the options @@ -190,7 +194,8 @@ func (c *ReaderConfig) Apply(options ...ReaderOption) { // ConfigureReader applies configuration options from c to config. 
func (c *ReaderConfig) ConfigureReader(config *ReaderConfig) { *config = ReaderConfig{ - Schema: coalesceSchema(c.Schema, config.Schema), + Schema: coalesceSchema(c.Schema, config.Schema), + SchemaConfig: coalesceSchemaConfig(c.SchemaConfig, config.SchemaConfig), } } @@ -223,6 +228,8 @@ type WriterConfig struct { Sorting SortingConfig SkipPageBounds [][]string Encodings map[Kind]encoding.Encoding + DictionaryMaxBytes int64 + SchemaConfig *SchemaConfig } // DefaultWriterConfig returns a new WriterConfig value initialized with the @@ -237,6 +244,7 @@ func DefaultWriterConfig() *WriterConfig { DataPageVersion: DefaultDataPageVersion, DataPageStatistics: DefaultDataPageStatistics, MaxRowsPerRowGroup: DefaultMaxRowsPerRowGroup, + SchemaConfig: DefaultSchemaConfig(), Sorting: SortingConfig{ SortingBuffers: &defaultSortingBufferPool, }, @@ -295,6 +303,7 @@ func (c *WriterConfig) ConfigureWriter(config *WriterConfig) { Sorting: coalesceSortingConfig(c.Sorting, config.Sorting), SkipPageBounds: coalesceSkipPageBounds(c.SkipPageBounds, config.SkipPageBounds), Encodings: encodings, + SchemaConfig: coalesceSchemaConfig(c.SchemaConfig, config.SchemaConfig), } } @@ -422,6 +431,16 @@ func (c *SortingConfig) ConfigureSorting(config *SortingConfig) { *config = coalesceSortingConfig(*c, *config) } +// SchemaOption is an interface implemented by types that carry configuration +// options for parquet schemas. SchemaOption also implements ReaderOption and WriterOption +// and may be used to configure the way NewGenericReader and NewGenericWriter derive schemas from the arguments. +type SchemaOption interface { + ReaderOption + WriterOption + + ConfigureSchema(*SchemaConfig) +} + // FileOption is an interface implemented by types that carry configuration // options for parquet files. 
type FileOption interface { @@ -705,6 +724,19 @@ func DefaultEncoding(enc encoding.Encoding) WriterOption { }) } +// DictionaryMaxBytes creates a configuration option which sets the maximum +// size in bytes for each column's dictionary. +// +// When a column's dictionary exceeds this limit, that column will switch from +// dictionary encoding to PLAIN encoding for the remainder of the row group. +// Pages written before the limit was reached remain dictionary-encoded, while +// subsequent pages use PLAIN encoding. +// +// A value of 0 (the default) means unlimited dictionary size. +func DictionaryMaxBytes(size int64) WriterOption { + return writerOption(func(config *WriterConfig) { config.DictionaryMaxBytes = size }) +} + // ColumnBufferCapacity creates a configuration option which defines the size of // row group column buffers. // @@ -754,6 +786,71 @@ func DropDuplicatedRows(drop bool) SortingOption { return sortingOption(func(config *SortingConfig) { config.DropDuplicatedRows = drop }) } +// The SchemaConfig type carries configuration options for parquet schemas. +// +// SchemaConfig implements the SchemaOption interface so it can be used directly +// as argument to the SchemaOf function when needed, for example: +// +// schema := parquet.SchemaOf(obj, &parquet.SchemaConfig{ +// ... +// }) +type SchemaConfig struct { + StructTags []StructTagOption +} + +func (c *SchemaConfig) ConfigureSchema(config *SchemaConfig) { + config.StructTags = coalesceStructTags(c.StructTags, config.StructTags) +} + +func (c *SchemaConfig) ConfigureReader(config *ReaderConfig) { + c.ConfigureSchema(config.SchemaConfig) +} + +func (c *SchemaConfig) ConfigureWriter(config *WriterConfig) { + c.ConfigureSchema(config.SchemaConfig) +} + +func DefaultSchemaConfig() *SchemaConfig { + return &SchemaConfig{} +} + +// StructTagOption performs runtime replacement of "parquet..." struct tags. 
This +// option can be used anywhere a schema is derived from a Go struct including +// SchemaOf, NewGenericReader, and NewGenericWriter. +type StructTagOption struct { + ColumnPath []string + StructTag reflect.StructTag +} + +var ( + _ SchemaOption = (*StructTagOption)(nil) + _ ReaderOption = (*StructTagOption)(nil) + _ WriterOption = (*StructTagOption)(nil) +) + +// StructTag performs runtime replacement of struct tags when deriving a schema from +// a Go struct for the column at the given path. This option can be used anywhere a schema is +// derived from a Go struct including SchemaOf, NewGenericReader, and NewGenericWriter. +// +// This option is additive, it may be used multiple times to affect multiple columns. +// +// When renaming a column, configure the option by its original name. +func StructTag(tag reflect.StructTag, path ...string) SchemaOption { + return &StructTagOption{StructTag: tag, ColumnPath: path} +} + +func (f *StructTagOption) ConfigureSchema(config *SchemaConfig) { + config.StructTags = append(config.StructTags, *f) +} + +func (f *StructTagOption) ConfigureWriter(config *WriterConfig) { + f.ConfigureSchema(config.SchemaConfig) +} + +func (f *StructTagOption) ConfigureReader(config *ReaderConfig) { + f.ConfigureSchema(config.SchemaConfig) +} + type fileOption func(*FileConfig) func (opt fileOption) ConfigureFile(config *FileConfig) { opt(config) } @@ -856,6 +953,20 @@ func coalesceCompression(c1, c2 compress.Codec) compress.Codec { return c2 } +func coalesceSchemaConfig(f1, f2 *SchemaConfig) *SchemaConfig { + if f1 != nil { + return f1 + } + return f2 +} + +func coalesceStructTags(s1, s2 []StructTagOption) []StructTagOption { + if len(s1) > 0 { + return s1 + } + return s2 +} + func validatePositiveInt(optionName string, optionValue int) error { if optionValue > 0 { return nil @@ -930,4 +1041,5 @@ var ( _ WriterOption = (*WriterConfig)(nil) _ RowGroupOption = (*RowGroupConfig)(nil) _ SortingOption = (*SortingConfig)(nil) + _ SchemaOption = 
(*SchemaConfig)(nil) ) diff --git a/vendor/github.com/parquet-go/parquet-go/convert.go b/vendor/github.com/parquet-go/parquet-go/convert.go index fcb31b9d3f8..5ef6afcc2b9 100644 --- a/vendor/github.com/parquet-go/parquet-go/convert.go +++ b/vendor/github.com/parquet-go/parquet-go/convert.go @@ -13,10 +13,10 @@ import ( "golang.org/x/sys/cpu" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) // ConvertError is an error type returned by calls to Convert when the conversion @@ -383,8 +383,13 @@ func ConvertRowGroup(rowGroup RowGroup, conv Conversion) RowGroup { numValues: numRows, numNulls: numRows, } - } else { + } else if i == int16(j) { columns[i] = rowGroupColumns[j] + } else { + columns[i] = &convertedColumnChunk{ + chunk: rowGroupColumns[j], + targetColumnIndex: ^int16(i), + } } }) @@ -601,6 +606,163 @@ func (c *convertedRows) SeekToRow(rowIndex int64) error { return c.rows.SeekToRow(rowIndex) } +// convertedColumnChunk wraps a ColumnChunk to fix the column index after reordering. +// When ConvertRowGroup reorders columns, the underlying chunk's Column() method +// returns the original position. This wrapper fixes both Column() and the +// columnIndex in values read from the chunk. 
+type convertedColumnChunk struct { + chunk ColumnChunk + targetColumnIndex int16 // XOR-encoded column index (^int16(columnIndex)) +} + +func (c *convertedColumnChunk) Type() Type { + return c.chunk.Type() +} + +func (c *convertedColumnChunk) Column() int { + return int(^c.targetColumnIndex) +} + +func (c *convertedColumnChunk) NumValues() int64 { + return c.chunk.NumValues() +} + +func (c *convertedColumnChunk) Pages() Pages { + return &convertedPages{ + pages: c.chunk.Pages(), + targetColumnIndex: c.targetColumnIndex, + } +} + +func (c *convertedColumnChunk) ColumnIndex() (ColumnIndex, error) { + return c.chunk.ColumnIndex() +} + +func (c *convertedColumnChunk) OffsetIndex() (OffsetIndex, error) { + return c.chunk.OffsetIndex() +} + +func (c *convertedColumnChunk) BloomFilter() BloomFilter { + return c.chunk.BloomFilter() +} + +// convertedPages wraps Pages to return convertedPage instances. +type convertedPages struct { + pages Pages + targetColumnIndex int16 +} + +func (p *convertedPages) ReadPage() (Page, error) { + page, err := p.pages.ReadPage() + if err != nil { + return nil, err + } + return &convertedPage{ + page: page, + targetColumnIndex: p.targetColumnIndex, + }, nil +} + +func (p *convertedPages) SeekToRow(rowIndex int64) error { + return p.pages.SeekToRow(rowIndex) +} + +func (p *convertedPages) Close() error { + return p.pages.Close() +} + +// convertedPage wraps a Page to return a convertedValueReader. 
+type convertedPage struct { + page Page + targetColumnIndex int16 +} + +func (p *convertedPage) Type() Type { + return p.page.Type() +} + +func (p *convertedPage) Column() int { + return int(^p.targetColumnIndex) +} + +func (p *convertedPage) Dictionary() Dictionary { + return p.page.Dictionary() +} + +func (p *convertedPage) NumRows() int64 { + return p.page.NumRows() +} + +func (p *convertedPage) NumValues() int64 { + return p.page.NumValues() +} + +func (p *convertedPage) NumNulls() int64 { + return p.page.NumNulls() +} + +func (p *convertedPage) Bounds() (min, max Value, ok bool) { + return p.page.Bounds() +} + +func (p *convertedPage) Size() int64 { + return p.page.Size() +} + +func (p *convertedPage) RepetitionLevels() []byte { + return p.page.RepetitionLevels() +} + +func (p *convertedPage) DefinitionLevels() []byte { + return p.page.DefinitionLevels() +} + +func (p *convertedPage) Data() encoding.Values { + return p.page.Data() +} + +func (p *convertedPage) Values() ValueReader { + return &convertedValueReader{ + reader: p.page.Values(), + targetColumnIndex: p.targetColumnIndex, + } +} + +func (p *convertedPage) Slice(i, j int64) Page { + return &convertedPage{ + page: p.page.Slice(i, j), + targetColumnIndex: p.targetColumnIndex, + } +} + +func (p *convertedPage) Retain() { + Retain(p.page) +} + +func (p *convertedPage) Release() { + Release(p.page) +} + +var ( + _ retainable = (*convertedPage)(nil) + _ releasable = (*convertedPage)(nil) +) + +// convertedValueReader wraps a ValueReader to rewrite columnIndex in values. 
+type convertedValueReader struct { + reader ValueReader + targetColumnIndex int16 +} + +func (r *convertedValueReader) ReadValues(values []Value) (int, error) { + n, err := r.reader.ReadValues(values) + // Rewrite columnIndex for all values to match the target column position + for i := range n { + values[i].columnIndex = r.targetColumnIndex + } + return n, err +} + var ( trueBytes = []byte(`true`) falseBytes = []byte(`false`) diff --git a/vendor/github.com/parquet-go/parquet-go/dictionary.go b/vendor/github.com/parquet-go/parquet-go/dictionary.go index d6e6c5933af..91b787d9268 100644 --- a/vendor/github.com/parquet-go/parquet-go/dictionary.go +++ b/vendor/github.com/parquet-go/parquet-go/dictionary.go @@ -5,12 +5,12 @@ import ( "math/bits" "unsafe" + "github.com/parquet-go/bitpack" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/encoding/plain" "github.com/parquet-go/parquet-go/hashprobe" - "github.com/parquet-go/parquet-go/internal/bitpack" - "github.com/parquet-go/parquet-go/internal/unsafecast" "github.com/parquet-go/parquet-go/sparse" "slices" ) @@ -52,6 +52,10 @@ type Dictionary interface { // Returns the number of value indexed in the dictionary. Len() int + // Returns the total size in bytes of all values stored in the dictionary. + // This is used for tracking dictionary memory usage and enforcing size limits. + Size() int64 + // Returns the dictionary value at the given index. 
Index(index int32) Value @@ -136,6 +140,8 @@ func (d *booleanDictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *booleanDictionary) Len() int { return int(d.numValues) } +func (d *booleanDictionary) Size() int64 { return int64(len(d.bits)) } + func (d *booleanDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *booleanDictionary) index(i int32) bool { return d.valueAt(int(i)) } @@ -233,6 +239,8 @@ func (d *int32Dictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *int32Dictionary) Len() int { return len(d.values) } +func (d *int32Dictionary) Size() int64 { return int64(len(d.values) * 4) } + func (d *int32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *int32Dictionary) index(i int32) int32 { return d.values[i] } @@ -332,6 +340,8 @@ func (d *int64Dictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *int64Dictionary) Len() int { return len(d.values) } +func (d *int64Dictionary) Size() int64 { return int64(len(d.values) * 8) } + func (d *int64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *int64Dictionary) index(i int32) int64 { return d.values[i] } @@ -418,6 +428,8 @@ func (d *int96Dictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *int96Dictionary) Len() int { return len(d.values) } +func (d *int96Dictionary) Size() int64 { return int64(len(d.values) * 12) } + func (d *int96Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *int96Dictionary) index(i int32) deprecated.Int96 { return d.values[i] } @@ -513,6 +525,8 @@ func (d *floatDictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *floatDictionary) Len() int { return len(d.values) } +func (d *floatDictionary) Size() int64 { return int64(len(d.values) * 4) } + func (d *floatDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *floatDictionary) index(i int32) float32 { return d.values[i] } @@ -599,6 +613,8 @@ 
func (d *doubleDictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *doubleDictionary) Len() int { return len(d.values) } +func (d *doubleDictionary) Size() int64 { return int64(len(d.values) * 8) } + func (d *doubleDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *doubleDictionary) index(i int32) float64 { return d.values[i] } @@ -699,6 +715,8 @@ func (d *byteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *byteArrayDictionary) Len() int { return d.len() } +func (d *byteArrayDictionary) Size() int64 { return int64(len(d.values)) } + func (d *byteArrayDictionary) Index(i int32) Value { return d.makeValueBytes(d.index(int(i))) } func (d *byteArrayDictionary) Insert(indexes []int32, values []Value) { @@ -808,6 +826,8 @@ func (d *fixedLenByteArrayDictionary) Type() Type { return newIndexedType(d.typ, func (d *fixedLenByteArrayDictionary) Len() int { return len(d.data) / d.size } +func (d *fixedLenByteArrayDictionary) Size() int64 { return int64(len(d.data)) } + func (d *fixedLenByteArrayDictionary) Index(i int32) Value { return d.makeValueBytes(d.index(i)) } @@ -921,6 +941,8 @@ func (d *uint32Dictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *uint32Dictionary) Len() int { return len(d.values) } +func (d *uint32Dictionary) Size() int64 { return int64(len(d.values) * 4) } + func (d *uint32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *uint32Dictionary) index(i int32) uint32 { return d.values[i] } @@ -1007,6 +1029,8 @@ func (d *uint64Dictionary) Type() Type { return newIndexedType(d.typ, d) } func (d *uint64Dictionary) Len() int { return len(d.values) } +func (d *uint64Dictionary) Size() int64 { return int64(len(d.values) * 8) } + func (d *uint64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *uint64Dictionary) index(i int32) uint64 { return d.values[i] } @@ -1093,6 +1117,8 @@ func (d *be128Dictionary) Type() Type { return 
newIndexedType(d.typ, d) } func (d *be128Dictionary) Len() int { return len(d.values) } +func (d *be128Dictionary) Size() int64 { return int64(len(d.values) * 16) } + func (d *be128Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) } func (d *be128Dictionary) index(i int32) *[16]byte { return &d.values[i] } @@ -1497,6 +1523,8 @@ func (d *nullDictionary) Type() Type { return d.nullPage.Type() } func (d *nullDictionary) Len() int { return int(d.nullPage.count) } +func (d *nullDictionary) Size() int64 { return 0 } + func (d *nullDictionary) Index(i int32) Value { return NullValue() } func (d *nullDictionary) Lookup(indexes []int32, values []Value) { diff --git a/vendor/github.com/parquet-go/parquet-go/dictionary_amd64.go b/vendor/github.com/parquet-go/parquet-go/dictionary_amd64.go index 8d0dedcad4b..03c98e784a4 100644 --- a/vendor/github.com/parquet-go/parquet-go/dictionary_amd64.go +++ b/vendor/github.com/parquet-go/parquet-go/dictionary_amd64.go @@ -5,7 +5,7 @@ package parquet import ( "unsafe" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/sparse" ) diff --git a/vendor/github.com/parquet-go/parquet-go/dictionary_purego.go b/vendor/github.com/parquet-go/parquet-go/dictionary_purego.go index 4893415250f..cab310afc7f 100644 --- a/vendor/github.com/parquet-go/parquet-go/dictionary_purego.go +++ b/vendor/github.com/parquet-go/parquet-go/dictionary_purego.go @@ -5,7 +5,7 @@ package parquet import ( "unsafe" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/sparse" ) diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit.go b/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit.go index 23b0202d7b0..de78dfe2e34 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit.go +++ 
b/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit.go @@ -1,9 +1,9 @@ package bytestreamsplit import ( + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) // This encoder implements a version of the Byte Stream Split encoding as described diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit_purego.go b/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit_purego.go index 6f5bf15c795..1007e3c7683 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit_purego.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/bytestreamsplit/bytestreamsplit_purego.go @@ -2,7 +2,7 @@ package bytestreamsplit -import "github.com/parquet-go/parquet-go/internal/unsafecast" +import "github.com/parquet-go/bitpack/unsafecast" func encodeFloat(dst, src []byte) { n := len(src) / 4 diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed.go b/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed.go index cf9d4cfc9ec..9e6f8c9b289 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed.go @@ -7,10 +7,10 @@ import ( "math" "math/bits" + "github.com/parquet-go/bitpack" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/bitpack" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) type BinaryPackedEncoding struct { @@ -324,7 +324,7 @@ func decodeInt32(dst, src []byte) ([]byte, []byte, error) { miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)] } miniBlockData = miniBlockData[:miniBlockSize] - 
bitpack.UnpackInt32(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) + bitpack.Unpack(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) } writeOffset += n totalValues -= n @@ -387,7 +387,7 @@ func decodeInt64(dst, src []byte) ([]byte, []byte, error) { miniBlockData = miniBlockTemp[:copy(miniBlockTemp, miniBlockData)] } miniBlockData = miniBlockData[:miniBlockSize] - bitpack.UnpackInt64(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) + bitpack.Unpack(out[writeOffset:writeOffset+n], miniBlockData, uint(bitWidth)) } writeOffset += n totalValues -= n diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed_amd64.go b/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed_amd64.go index 11a5a538b1f..a466e5b9ae2 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed_amd64.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/delta/binary_packed_amd64.go @@ -3,7 +3,7 @@ package delta import ( - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" "golang.org/x/sys/cpu" ) diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/delta/delta.go b/vendor/github.com/parquet-go/parquet-go/encoding/delta/delta.go index 3549a3270f9..248ba1fb57c 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/delta/delta.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/delta/delta.go @@ -4,7 +4,7 @@ import ( "fmt" "sync" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" ) type int32Buffer struct { diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain.go b/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain.go index 690bc815555..c16aa2b7af5 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain.go @@ -9,10 +9,10 @@ import ( "io" "math" + 
"github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) const ( diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain_le.go b/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain_le.go index bd1eadf6a06..97772a1cdcc 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain_le.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/plain/plain_le.go @@ -3,8 +3,8 @@ package plain import ( + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/encoding" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) func (e *Encoding) EncodeInt32(dst []byte, src []int32) ([]byte, error) { diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/rle/dictionary.go b/vendor/github.com/parquet-go/parquet-go/encoding/rle/dictionary.go index 8304afc0188..b21fda912ca 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/rle/dictionary.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/rle/dictionary.go @@ -3,9 +3,9 @@ package rle import ( "math/bits" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) type DictionaryEncoding struct { diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/rle/rle.go b/vendor/github.com/parquet-go/parquet-go/encoding/rle/rle.go index 4f17d3c4b88..1ef0e43370c 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/rle/rle.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/rle/rle.go @@ -13,11 +13,11 @@ import ( "golang.org/x/sys/cpu" + "github.com/parquet-go/bitpack" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/encoding" "github.com/parquet-go/parquet-go/format" - 
"github.com/parquet-go/parquet-go/internal/bitpack" "github.com/parquet-go/parquet-go/internal/bytealg" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) const ( @@ -386,7 +386,7 @@ func decodeInt32(dst, src []byte, bitWidth uint) ([]byte, error) { } out := unsafecast.Slice[int32](dst[offset:]) - bitpack.UnpackInt32(out, in, bitWidth) + bitpack.Unpack(out, in, bitWidth) i += length } else { j := i + bitpack.ByteCount(bitWidth) @@ -517,7 +517,7 @@ func grow(buf []byte, size int) []byte { func encodeInt32BitpackDefault(dst []byte, src [][8]int32, bitWidth uint) int { bits := unsafecast.Slice[int32](src) - bitpack.PackInt32(dst, bits, bitWidth) + bitpack.Pack(dst, bits, bitWidth) return bitpack.ByteCount(uint(len(src)*8) * bitWidth) } diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/thrift/binary.go b/vendor/github.com/parquet-go/parquet-go/encoding/thrift/binary.go index 73f15b03afb..82d7fe610f5 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/thrift/binary.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/thrift/binary.go @@ -8,7 +8,7 @@ import ( "io" "math" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" ) // BinaryProtocol is a Protocol implementation for the binary thrift protocol. diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/thrift/compact.go b/vendor/github.com/parquet-go/parquet-go/encoding/thrift/compact.go index 7bca5771deb..65d98c2b386 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/thrift/compact.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/thrift/compact.go @@ -8,7 +8,7 @@ import ( "io" "math" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" ) // CompactProtocol is a Protocol implementation for the compact thrift protocol. 
diff --git a/vendor/github.com/parquet-go/parquet-go/encoding/values.go b/vendor/github.com/parquet-go/parquet-go/encoding/values.go index 41ab0a23e35..9e82eb7cc0a 100644 --- a/vendor/github.com/parquet-go/parquet-go/encoding/values.go +++ b/vendor/github.com/parquet-go/parquet-go/encoding/values.go @@ -3,8 +3,8 @@ package encoding import ( "fmt" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" - "github.com/parquet-go/parquet-go/internal/unsafecast" ) type Kind int32 diff --git a/vendor/github.com/parquet-go/parquet-go/file.go b/vendor/github.com/parquet-go/parquet-go/file.go index 58901415bff..90507af5698 100644 --- a/vendor/github.com/parquet-go/parquet-go/file.go +++ b/vendor/github.com/parquet-go/parquet-go/file.go @@ -764,10 +764,8 @@ func (f *FilePages) init(c *FileColumnChunk, reader io.ReaderAt) { f.rbuf, f.rbufpool = getBufioReader(&f.section, f.bufferSize) f.decoder.Reset(f.protocol.NewReader(f.rbuf)) f.index = 0 - if f.lastPage != nil { - Release(f.lastPage) - f.lastPage = nil - } + Release(f.lastPage) + f.lastPage = nil f.lastPageIndex = -1 f.serveLastPage = false } @@ -944,7 +942,7 @@ func (f *FilePages) readDictionary() error { return f.readDictionaryPage(header, page) } -func (f *FilePages) readDictionaryPage(header *format.PageHeader, page *buffer) error { +func (f *FilePages) readDictionaryPage(header *format.PageHeader, page *buffer[byte]) error { if header.DictionaryPageHeader == nil { return ErrMissingPageHeader } @@ -956,7 +954,7 @@ func (f *FilePages) readDictionaryPage(header *format.PageHeader, page *buffer) return nil } -func (f *FilePages) readDataPageV1(header *format.PageHeader, page *buffer) (Page, error) { +func (f *FilePages) readDataPageV1(header *format.PageHeader, page *buffer[byte]) (Page, error) { if header.DataPageHeader == nil { return nil, ErrMissingPageHeader } @@ -968,7 +966,7 @@ func (f *FilePages) readDataPageV1(header *format.PageHeader, page *buffer) (Pag return 
f.chunk.column.decodeDataPageV1(DataPageHeaderV1{header.DataPageHeader}, page, f.dictionary, header.UncompressedPageSize) } -func (f *FilePages) readDataPageV2(header *format.PageHeader, page *buffer) (Page, error) { +func (f *FilePages) readDataPageV2(header *format.PageHeader, page *buffer[byte]) (Page, error) { if header.DataPageHeaderV2 == nil { return nil, ErrMissingPageHeader } @@ -983,7 +981,7 @@ func (f *FilePages) readDataPageV2(header *format.PageHeader, page *buffer) (Pag return f.chunk.column.decodeDataPageV2(DataPageHeaderV2{header.DataPageHeaderV2}, page, f.dictionary, header.UncompressedPageSize) } -func (f *FilePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer, error) { +func (f *FilePages) readPage(header *format.PageHeader, reader *bufio.Reader) (*buffer[byte], error) { page := buffers.get(int(header.CompressedPageSize)) defer page.unref() @@ -1098,10 +1096,8 @@ func (f *FilePages) Close() error { f.dataOffset = 0 f.dictOffset = 0 f.index = 0 - if f.lastPage != nil { - Release(f.lastPage) - f.lastPage = nil - } + Release(f.lastPage) + f.lastPage = nil f.lastPageIndex = -1 f.serveLastPage = false f.skip = 0 diff --git a/vendor/github.com/parquet-go/parquet-go/go.tools.mod b/vendor/github.com/parquet-go/parquet-go/go.tools.mod index 7640399bb58..6c560715ebb 100644 --- a/vendor/github.com/parquet-go/parquet-go/go.tools.mod +++ b/vendor/github.com/parquet-go/parquet-go/go.tools.mod @@ -1,8 +1,6 @@ module github.com/parquet-go/parquet-go -go 1.23.4 - -toolchain go1.24.0 +go 1.25.1 tool golang.org/x/tools/gopls/internal/analysis/modernize/cmd/modernize @@ -11,8 +9,9 @@ require ( github.com/google/uuid v1.6.0 github.com/hexops/gotextdiff v1.0.3 github.com/klauspost/compress v1.18.0 + github.com/parquet-go/bitpack v0.0.0-20251026130316-7709569977d0 github.com/pierrec/lz4/v4 v4.1.22 - golang.org/x/sys v0.30.0 + golang.org/x/sys v0.37.0 google.golang.org/protobuf v1.36.5 ) diff --git 
a/vendor/github.com/parquet-go/parquet-go/go.tools.sum b/vendor/github.com/parquet-go/parquet-go/go.tools.sum index 4a10b34d90d..1636714f240 100644 --- a/vendor/github.com/parquet-go/parquet-go/go.tools.sum +++ b/vendor/github.com/parquet-go/parquet-go/go.tools.sum @@ -8,6 +8,8 @@ github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUq github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/parquet-go/bitpack v0.0.0-20251026130316-7709569977d0 h1:WDJJwDZk8KYIFVkyvQJE7q6UGwNSre4pzw+bpJ0osIY= +github.com/parquet-go/bitpack v0.0.0-20251026130316-7709569977d0/go.mod h1:Ef5FEKAqlyHECov7Z3RKOq8Ud9Vr8cbGNoCF56KEKaE= github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= @@ -16,8 +18,8 @@ golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/tools v0.30.1-0.20250221230316-5055f70f240c h1:Ja/5gV5a9Vvho3p2NC/T2TtxhHjrWS/2DvCKMvA0a+Y= golang.org/x/tools v0.30.1-0.20250221230316-5055f70f240c/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= 
golang.org/x/tools/gopls v0.18.1 h1:2xJBNzdImS5u/kV/ZzqDLSvlBSeZX+pWY9uKVP7Pask= diff --git a/vendor/github.com/parquet-go/parquet-go/hashprobe/hashprobe.go b/vendor/github.com/parquet-go/parquet-go/hashprobe/hashprobe.go index 0a1686f17b6..1c81b32856a 100644 --- a/vendor/github.com/parquet-go/parquet-go/hashprobe/hashprobe.go +++ b/vendor/github.com/parquet-go/parquet-go/hashprobe/hashprobe.go @@ -33,9 +33,9 @@ import ( "math/rand" "sync" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/hashprobe/aeshash" "github.com/parquet-go/parquet-go/hashprobe/wyhash" - "github.com/parquet-go/parquet-go/internal/unsafecast" "github.com/parquet-go/parquet-go/sparse" ) diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/pack.go b/vendor/github.com/parquet-go/parquet-go/internal/bitpack/pack.go deleted file mode 100644 index 585202842fc..00000000000 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/pack.go +++ /dev/null @@ -1,83 +0,0 @@ -package bitpack - -import ( - "encoding/binary" -) - -// PackInt32 packs values from src to dst, each value is packed into the given -// bit width regardless of how many bits are needed to represent it. -// -// The function panics if dst is too short to hold the bit packed values. -func PackInt32(dst []byte, src []int32, bitWidth uint) { - assertPack(dst, len(src), bitWidth) - packInt32(dst, src, bitWidth) -} - -func packInt32(dst []byte, src []int32, bitWidth uint) { - n := ByteCount(uint(len(src)) * bitWidth) - b := dst[:n] - - for i := range b { - b[i] = 0 - } - - bitMask := uint32(1<> (32 - j)) - - binary.LittleEndian.PutUint32(dst[(i+0)*4:], lo) - binary.LittleEndian.PutUint32(dst[(i+1)*4:], hi) - - bitOffset += bitWidth - } -} - -// PackInt64 packs values from src to dst, each value is packed into the given -// bit width regardless of how many bits are needed to represent it. -// -// The function panics if dst is too short to hold the bit packed values. 
-func PackInt64(dst []byte, src []int64, bitWidth uint) { - assertPack(dst, len(src), bitWidth) - packInt64(dst, src, bitWidth) -} - -func packInt64(dst []byte, src []int64, bitWidth uint) { - n := ByteCount(uint(len(src)) * bitWidth) - b := dst[:n] - - for i := range b { - b[i] = 0 - } - - bitMask := uint64(1<> (64 - j)) - - binary.LittleEndian.PutUint64(dst[(i+0)*8:], lo) - binary.LittleEndian.PutUint64(dst[(i+1)*8:], hi) - - bitOffset += bitWidth - } -} - -func assertPack(dst []byte, count int, bitWidth uint) { - _ = dst[:ByteCount(bitWidth*uint(count))] -} diff --git a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack.go b/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack.go deleted file mode 100644 index 752a6f7ff8c..00000000000 --- a/vendor/github.com/parquet-go/parquet-go/internal/bitpack/unpack.go +++ /dev/null @@ -1,27 +0,0 @@ -package bitpack - -// PaddingInt32 is the padding expected to exist after the end of input buffers -// for the UnpackInt32 algorithm to avoid reading beyond the end of the input. -const PaddingInt32 = 16 - -// PaddingInt64 is the padding expected to exist after the end of input buffers -// for the UnpackInt32 algorithm to avoid reading beyond the end of the input. -const PaddingInt64 = 32 - -// UnpackInt32 unpacks 32 bit integers from src to dst. -// -// The function unpacked len(dst) integers, it panics if src is too short to -// contain len(dst) values of the given bit width. -func UnpackInt32(dst []int32, src []byte, bitWidth uint) { - _ = src[:ByteCount(bitWidth*uint(len(dst))+8*PaddingInt32)] - unpackInt32(dst, src, bitWidth) -} - -// UnpackInt64 unpacks 64 bit integers from src to dst. -// -// The function unpacked len(dst) integers, it panics if src is too short to -// contain len(dst) values of the given bit width. 
-func UnpackInt64(dst []int64, src []byte, bitWidth uint) { - _ = src[:ByteCount(bitWidth*uint(len(dst))+8*PaddingInt64)] - unpackInt64(dst, src, bitWidth) -} diff --git a/vendor/github.com/parquet-go/parquet-go/internal/debug/debug.go b/vendor/github.com/parquet-go/parquet-go/internal/debug/debug.go index 45c12e2a03a..8acc28c4c79 100644 --- a/vendor/github.com/parquet-go/parquet-go/internal/debug/debug.go +++ b/vendor/github.com/parquet-go/parquet-go/internal/debug/debug.go @@ -73,7 +73,7 @@ var ( ) func init() { - for _, arg := range strings.Split(os.Getenv("PARQUETGODEBUG"), ",") { + for arg := range strings.SplitSeq(os.Getenv("PARQUETGODEBUG"), ",") { k := arg v := "" i := strings.IndexByte(arg, '=') diff --git a/vendor/github.com/parquet-go/parquet-go/merge.go b/vendor/github.com/parquet-go/parquet-go/merge.go index 795c878c37f..588ed044dc9 100644 --- a/vendor/github.com/parquet-go/parquet-go/merge.go +++ b/vendor/github.com/parquet-go/parquet-go/merge.go @@ -30,6 +30,9 @@ func MergeRowGroups(rowGroups []RowGroup, options ...RowGroupOption) (RowGroup, schema := config.Schema if len(rowGroups) == 0 { + if schema == nil { + return nil, fmt.Errorf("cannot merge empty row groups without a schema") + } return newEmptyRowGroup(schema), nil } if schema == nil { diff --git a/vendor/github.com/parquet-go/parquet-go/node.go b/vendor/github.com/parquet-go/parquet-go/node.go index 8abf0fa5bd4..d2460c9d568 100644 --- a/vendor/github.com/parquet-go/parquet-go/node.go +++ b/vendor/github.com/parquet-go/parquet-go/node.go @@ -491,6 +491,20 @@ func fieldByName(node Node, name string) Field { return nil } +// findByPath navigates the node tree to find the node at the given path. +// Returns nil if the path doesn't exist. +// The path is a sequence of field names to traverse. 
+func findByPath(node Node, path []string) Node { + for _, name := range path { + field := fieldByName(node, name) + if field == nil { + return nil + } + node = field + } + return node +} + // EqualNodes returns true if node1 and node2 are equal. // // Nodes that are not of the same repetition type (optional, required, repeated) diff --git a/vendor/github.com/parquet-go/parquet-go/null.go b/vendor/github.com/parquet-go/parquet-go/null.go index 6588580934b..35806b3ecba 100644 --- a/vendor/github.com/parquet-go/parquet-go/null.go +++ b/vendor/github.com/parquet-go/parquet-go/null.go @@ -2,11 +2,12 @@ package parquet import ( "reflect" + "time" "unsafe" + "github.com/parquet-go/bitpack/unsafecast" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/internal/bytealg" - "github.com/parquet-go/parquet-go/internal/unsafecast" "github.com/parquet-go/parquet-go/sparse" ) @@ -35,10 +36,23 @@ func nullIndexStruct(bits []uint64, rows sparse.Array) { bytealg.Broadcast(unsafecast.Slice[byte](bits), 0xFF) } +func nullIndexTime(bits []uint64, rows sparse.Array) { + for i := range rows.Len() { + t := (*time.Time)(rows.Index(i)) + if !t.IsZero() { + x := uint(i) / 64 + y := uint(i) % 64 + bits[x] |= 1 << y + } + } +} + func nullIndexFuncOf(t reflect.Type) nullIndexFunc { switch t { case reflect.TypeOf(deprecated.Int96{}): return nullIndex[deprecated.Int96] + case reflect.TypeOf(time.Time{}): + return nullIndexTime } switch t.Kind() { diff --git a/vendor/github.com/parquet-go/parquet-go/order.go b/vendor/github.com/parquet-go/parquet-go/order.go index b789f1bb659..6fef7bba2fa 100644 --- a/vendor/github.com/parquet-go/parquet-go/order.go +++ b/vendor/github.com/parquet-go/parquet-go/order.go @@ -3,7 +3,7 @@ package parquet import ( "bytes" - "github.com/parquet-go/parquet-go/internal/unsafecast" + "github.com/parquet-go/bitpack/unsafecast" ) func orderOfBool(data []bool) int { diff --git a/vendor/github.com/parquet-go/parquet-go/page.go 
b/vendor/github.com/parquet-go/parquet-go/page.go index 787c3777cfa..605fcfb33e1 100644 --- a/vendor/github.com/parquet-go/parquet-go/page.go +++ b/vendor/github.com/parquet-go/parquet-go/page.go @@ -1,14 +1,11 @@ package parquet import ( - "bytes" "errors" "fmt" "io" - "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding" - "github.com/parquet-go/parquet-go/internal/bitpack" "github.com/parquet-go/parquet-go/internal/debug" ) @@ -359,11 +356,7 @@ func CopyPages(dst PageWriter, src PageReader) (numValues int64, err error) { } // errorPage is an implementation of the Page interface which always errors when -// attempting to read its values. -// -// The error page declares that it contains one value (even if it does not) -// as a way to ensure that it is not ignored due to being empty when written -// to a file. +// values are read from it. type errorPage struct { typ Type err error @@ -401,1069 +394,21 @@ func errPageBoundsOutOfRange(i, j, n int64) error { return fmt.Errorf("page bounds out of range [%d:%d]: with length %d", i, j, n) } -type optionalPage struct { - base Page - maxDefinitionLevel byte - definitionLevels []byte -} - -func newOptionalPage(base Page, maxDefinitionLevel byte, definitionLevels []byte) *optionalPage { - return &optionalPage{ - base: base, - maxDefinitionLevel: maxDefinitionLevel, - definitionLevels: definitionLevels, - } -} - -func (page *optionalPage) Type() Type { return page.base.Type() } - -func (page *optionalPage) Column() int { return page.base.Column() } - -func (page *optionalPage) Dictionary() Dictionary { return page.base.Dictionary() } - -func (page *optionalPage) NumRows() int64 { return int64(len(page.definitionLevels)) } - -func (page *optionalPage) NumValues() int64 { return int64(len(page.definitionLevels)) } - -func (page *optionalPage) NumNulls() int64 { - return int64(countLevelsNotEqual(page.definitionLevels, page.maxDefinitionLevel)) -} - -func (page *optionalPage) Bounds() (min, 
max Value, ok bool) { return page.base.Bounds() } - -func (page *optionalPage) Size() int64 { return int64(len(page.definitionLevels)) + page.base.Size() } - -func (page *optionalPage) RepetitionLevels() []byte { return nil } - -func (page *optionalPage) DefinitionLevels() []byte { return page.definitionLevels } - -func (page *optionalPage) Data() encoding.Values { return page.base.Data() } - -func (page *optionalPage) Values() ValueReader { - return &optionalPageValues{ - page: page, - values: page.base.Values(), - } -} - -func (page *optionalPage) Slice(i, j int64) Page { - maxDefinitionLevel := page.maxDefinitionLevel - definitionLevels := page.definitionLevels - numNulls1 := int64(countLevelsNotEqual(definitionLevels[:i], maxDefinitionLevel)) - numNulls2 := int64(countLevelsNotEqual(definitionLevels[i:j], maxDefinitionLevel)) - return newOptionalPage( - page.base.Slice(i-numNulls1, j-(numNulls1+numNulls2)), - maxDefinitionLevel, - definitionLevels[i:j:j], - ) -} - -type repeatedPage struct { - base Page - maxRepetitionLevel byte - maxDefinitionLevel byte - definitionLevels []byte - repetitionLevels []byte -} - -func newRepeatedPage(base Page, maxRepetitionLevel, maxDefinitionLevel byte, repetitionLevels, definitionLevels []byte) *repeatedPage { - return &repeatedPage{ - base: base, - maxRepetitionLevel: maxRepetitionLevel, - maxDefinitionLevel: maxDefinitionLevel, - definitionLevels: definitionLevels, - repetitionLevels: repetitionLevels, - } -} - -func (page *repeatedPage) Type() Type { return page.base.Type() } - -func (page *repeatedPage) Column() int { return page.base.Column() } - -func (page *repeatedPage) Dictionary() Dictionary { return page.base.Dictionary() } - -func (page *repeatedPage) NumRows() int64 { return int64(countLevelsEqual(page.repetitionLevels, 0)) } - -func (page *repeatedPage) NumValues() int64 { return int64(len(page.definitionLevels)) } - -func (page *repeatedPage) NumNulls() int64 { - return 
int64(countLevelsNotEqual(page.definitionLevels, page.maxDefinitionLevel)) -} - -func (page *repeatedPage) Bounds() (min, max Value, ok bool) { return page.base.Bounds() } - -func (page *repeatedPage) Size() int64 { - return int64(len(page.repetitionLevels)) + int64(len(page.definitionLevels)) + page.base.Size() -} - -func (page *repeatedPage) RepetitionLevels() []byte { return page.repetitionLevels } - -func (page *repeatedPage) DefinitionLevels() []byte { return page.definitionLevels } - -func (page *repeatedPage) Data() encoding.Values { return page.base.Data() } - -func (page *repeatedPage) Values() ValueReader { - return &repeatedPageValues{ - page: page, - values: page.base.Values(), - } -} - -func (page *repeatedPage) Slice(i, j int64) Page { - numRows := page.NumRows() - if i < 0 || i > numRows { - panic(errPageBoundsOutOfRange(i, j, numRows)) - } - if j < 0 || j > numRows { - panic(errPageBoundsOutOfRange(i, j, numRows)) - } - if i > j { - panic(errPageBoundsOutOfRange(i, j, numRows)) - } - - maxRepetitionLevel := page.maxRepetitionLevel - maxDefinitionLevel := page.maxDefinitionLevel - repetitionLevels := page.repetitionLevels - definitionLevels := page.definitionLevels - - rowIndex0 := 0 - rowIndex1 := len(repetitionLevels) - rowIndex2 := len(repetitionLevels) - - for k, def := range repetitionLevels { - if def == 0 { - if rowIndex0 == int(i) { - rowIndex1 = k - break - } - rowIndex0++ - } - } - - for k, def := range repetitionLevels[rowIndex1:] { - if def == 0 { - if rowIndex0 == int(j) { - rowIndex2 = rowIndex1 + k - break - } - rowIndex0++ - } - } - - numNulls1 := countLevelsNotEqual(definitionLevels[:rowIndex1], maxDefinitionLevel) - numNulls2 := countLevelsNotEqual(definitionLevels[rowIndex1:rowIndex2], maxDefinitionLevel) - - i = int64(rowIndex1 - numNulls1) - j = int64(rowIndex2 - (numNulls1 + numNulls2)) - - return newRepeatedPage( - page.base.Slice(i, j), - maxRepetitionLevel, - maxDefinitionLevel, - 
repetitionLevels[rowIndex1:rowIndex2:rowIndex2], - definitionLevels[rowIndex1:rowIndex2:rowIndex2], - ) -} - -type booleanPage struct { - typ Type - bits []byte - offset int32 - numValues int32 - columnIndex int16 -} - -func newBooleanPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *booleanPage { - return &booleanPage{ - typ: typ, - bits: values.Boolean()[:bitpack.ByteCount(uint(numValues))], - numValues: numValues, - columnIndex: ^columnIndex, - } -} - -func (page *booleanPage) Type() Type { return page.typ } - -func (page *booleanPage) Column() int { return int(^page.columnIndex) } - -func (page *booleanPage) Dictionary() Dictionary { return nil } - -func (page *booleanPage) NumRows() int64 { return int64(page.numValues) } - -func (page *booleanPage) NumValues() int64 { return int64(page.numValues) } - -func (page *booleanPage) NumNulls() int64 { return 0 } - -func (page *booleanPage) Size() int64 { return int64(len(page.bits)) } - -func (page *booleanPage) RepetitionLevels() []byte { return nil } - -func (page *booleanPage) DefinitionLevels() []byte { return nil } - -func (page *booleanPage) Data() encoding.Values { return encoding.BooleanValues(page.bits) } - -func (page *booleanPage) Values() ValueReader { return &booleanPageValues{page: page} } - -func (page *booleanPage) valueAt(i int) bool { - j := uint32(int(page.offset)+i) / 8 - k := uint32(int(page.offset)+i) % 8 - return ((page.bits[j] >> k) & 1) != 0 -} - -func (page *booleanPage) min() bool { - for i := range int(page.numValues) { - if !page.valueAt(i) { - return false - } - } - return page.numValues > 0 -} - -func (page *booleanPage) max() bool { - for i := range int(page.numValues) { - if page.valueAt(i) { - return true - } - } - return false -} - -func (page *booleanPage) bounds() (min, max bool) { - hasFalse, hasTrue := false, false - - for i := range int(page.numValues) { - v := page.valueAt(i) - if v { - hasTrue = true - } else { - hasFalse = true - } - if hasTrue && 
hasFalse { - break - } - } - - min = !hasFalse - max = hasTrue - return min, max -} - -func (page *booleanPage) Bounds() (min, max Value, ok bool) { - if ok = page.numValues > 0; ok { - minBool, maxBool := page.bounds() - min = page.makeValue(minBool) - max = page.makeValue(maxBool) - } - return min, max, ok -} - -func (page *booleanPage) Slice(i, j int64) Page { - lowWithOffset := i + int64(page.offset) - highWithOffset := j + int64(page.offset) - - off := lowWithOffset / 8 - end := highWithOffset / 8 - - if (highWithOffset % 8) != 0 { - end++ - } - - return &booleanPage{ - typ: page.typ, - bits: page.bits[off:end], - offset: int32(lowWithOffset % 8), - numValues: int32(j - i), - columnIndex: page.columnIndex, - } -} - -func (page *booleanPage) makeValue(v bool) Value { - value := makeValueBoolean(v) - value.columnIndex = page.columnIndex - return value -} - -type int32Page struct { - typ Type - values []int32 - columnIndex int16 -} - -func newInt32Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int32Page { - return &int32Page{ - typ: typ, - values: values.Int32()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *int32Page) Type() Type { return page.typ } - -func (page *int32Page) Column() int { return int(^page.columnIndex) } - -func (page *int32Page) Dictionary() Dictionary { return nil } - -func (page *int32Page) NumRows() int64 { return int64(len(page.values)) } - -func (page *int32Page) NumValues() int64 { return int64(len(page.values)) } - -func (page *int32Page) NumNulls() int64 { return 0 } - -func (page *int32Page) Size() int64 { return 4 * int64(len(page.values)) } - -func (page *int32Page) RepetitionLevels() []byte { return nil } - -func (page *int32Page) DefinitionLevels() []byte { return nil } - -func (page *int32Page) Data() encoding.Values { return encoding.Int32Values(page.values) } - -func (page *int32Page) Values() ValueReader { return &int32PageValues{page: page} } - -func (page *int32Page) min() int32 { 
return minInt32(page.values) } - -func (page *int32Page) max() int32 { return maxInt32(page.values) } - -func (page *int32Page) bounds() (min, max int32) { return boundsInt32(page.values) } - -func (page *int32Page) Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minInt32, maxInt32 := page.bounds() - min = page.makeValue(minInt32) - max = page.makeValue(maxInt32) - } - return min, max, ok -} - -func (page *int32Page) Slice(i, j int64) Page { - return &int32Page{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *int32Page) makeValue(v int32) Value { - value := makeValueInt32(v) - value.columnIndex = page.columnIndex - return value -} - -type int64Page struct { - typ Type - values []int64 - columnIndex int16 -} - -func newInt64Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int64Page { - return &int64Page{ - typ: typ, - values: values.Int64()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *int64Page) Type() Type { return page.typ } - -func (page *int64Page) Column() int { return int(^page.columnIndex) } - -func (page *int64Page) Dictionary() Dictionary { return nil } - -func (page *int64Page) NumRows() int64 { return int64(len(page.values)) } - -func (page *int64Page) NumValues() int64 { return int64(len(page.values)) } - -func (page *int64Page) NumNulls() int64 { return 0 } - -func (page *int64Page) Size() int64 { return 8 * int64(len(page.values)) } - -func (page *int64Page) RepetitionLevels() []byte { return nil } - -func (page *int64Page) DefinitionLevels() []byte { return nil } - -func (page *int64Page) Data() encoding.Values { return encoding.Int64Values(page.values) } - -func (page *int64Page) Values() ValueReader { return &int64PageValues{page: page} } - -func (page *int64Page) min() int64 { return minInt64(page.values) } - -func (page *int64Page) max() int64 { return maxInt64(page.values) } - -func (page *int64Page) bounds() (min, max 
int64) { return boundsInt64(page.values) } - -func (page *int64Page) Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minInt64, maxInt64 := page.bounds() - min = page.makeValue(minInt64) - max = page.makeValue(maxInt64) - } - return min, max, ok -} - -func (page *int64Page) Slice(i, j int64) Page { - return &int64Page{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *int64Page) makeValue(v int64) Value { - value := makeValueInt64(v) - value.columnIndex = page.columnIndex - return value -} - -type int96Page struct { - typ Type - values []deprecated.Int96 - columnIndex int16 -} - -func newInt96Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int96Page { - return &int96Page{ - typ: typ, - values: values.Int96()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *int96Page) Type() Type { return page.typ } - -func (page *int96Page) Column() int { return int(^page.columnIndex) } - -func (page *int96Page) Dictionary() Dictionary { return nil } - -func (page *int96Page) NumRows() int64 { return int64(len(page.values)) } - -func (page *int96Page) NumValues() int64 { return int64(len(page.values)) } - -func (page *int96Page) NumNulls() int64 { return 0 } - -func (page *int96Page) Size() int64 { return 12 * int64(len(page.values)) } - -func (page *int96Page) RepetitionLevels() []byte { return nil } - -func (page *int96Page) DefinitionLevels() []byte { return nil } - -func (page *int96Page) Data() encoding.Values { return encoding.Int96Values(page.values) } - -func (page *int96Page) Values() ValueReader { return &int96PageValues{page: page} } - -func (page *int96Page) min() deprecated.Int96 { return deprecated.MinInt96(page.values) } - -func (page *int96Page) max() deprecated.Int96 { return deprecated.MaxInt96(page.values) } - -func (page *int96Page) bounds() (min, max deprecated.Int96) { - return deprecated.MinMaxInt96(page.values) -} - -func (page *int96Page) 
Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minInt96, maxInt96 := page.bounds() - min = page.makeValue(minInt96) - max = page.makeValue(maxInt96) - } - return min, max, ok -} - -func (page *int96Page) Slice(i, j int64) Page { - return &int96Page{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *int96Page) makeValue(v deprecated.Int96) Value { - value := makeValueInt96(v) - value.columnIndex = page.columnIndex - return value -} - -type floatPage struct { - typ Type - values []float32 - columnIndex int16 -} - -func newFloatPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *floatPage { - return &floatPage{ - typ: typ, - values: values.Float()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *floatPage) Type() Type { return page.typ } - -func (page *floatPage) Column() int { return int(^page.columnIndex) } - -func (page *floatPage) Dictionary() Dictionary { return nil } - -func (page *floatPage) NumRows() int64 { return int64(len(page.values)) } - -func (page *floatPage) NumValues() int64 { return int64(len(page.values)) } - -func (page *floatPage) NumNulls() int64 { return 0 } - -func (page *floatPage) Size() int64 { return 4 * int64(len(page.values)) } - -func (page *floatPage) RepetitionLevels() []byte { return nil } - -func (page *floatPage) DefinitionLevels() []byte { return nil } - -func (page *floatPage) Data() encoding.Values { return encoding.FloatValues(page.values) } - -func (page *floatPage) Values() ValueReader { return &floatPageValues{page: page} } - -func (page *floatPage) min() float32 { return minFloat32(page.values) } - -func (page *floatPage) max() float32 { return maxFloat32(page.values) } - -func (page *floatPage) bounds() (min, max float32) { return boundsFloat32(page.values) } - -func (page *floatPage) Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minFloat32, maxFloat32 := page.bounds() - min = 
page.makeValue(minFloat32) - max = page.makeValue(maxFloat32) - } - return min, max, ok -} - -func (page *floatPage) Slice(i, j int64) Page { - return &floatPage{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *floatPage) makeValue(v float32) Value { - value := makeValueFloat(v) - value.columnIndex = page.columnIndex - return value -} - -type doublePage struct { - typ Type - values []float64 - columnIndex int16 -} - -func newDoublePage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *doublePage { - return &doublePage{ - typ: typ, - values: values.Double()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *doublePage) Type() Type { return page.typ } - -func (page *doublePage) Column() int { return int(^page.columnIndex) } - -func (page *doublePage) Dictionary() Dictionary { return nil } - -func (page *doublePage) NumRows() int64 { return int64(len(page.values)) } - -func (page *doublePage) NumValues() int64 { return int64(len(page.values)) } - -func (page *doublePage) NumNulls() int64 { return 0 } - -func (page *doublePage) Size() int64 { return 8 * int64(len(page.values)) } - -func (page *doublePage) RepetitionLevels() []byte { return nil } - -func (page *doublePage) DefinitionLevels() []byte { return nil } - -func (page *doublePage) Data() encoding.Values { return encoding.DoubleValues(page.values) } - -func (page *doublePage) Values() ValueReader { return &doublePageValues{page: page} } - -func (page *doublePage) min() float64 { return minFloat64(page.values) } - -func (page *doublePage) max() float64 { return maxFloat64(page.values) } - -func (page *doublePage) bounds() (min, max float64) { return boundsFloat64(page.values) } - -func (page *doublePage) Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minFloat64, maxFloat64 := page.bounds() - min = page.makeValue(minFloat64) - max = page.makeValue(maxFloat64) - } - return min, max, ok -} - -func (page 
*doublePage) Slice(i, j int64) Page { - return &doublePage{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *doublePage) makeValue(v float64) Value { - value := makeValueDouble(v) - value.columnIndex = page.columnIndex - return value -} - -type byteArrayPage struct { - typ Type - values []byte - offsets []uint32 - columnIndex int16 -} - -func newByteArrayPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *byteArrayPage { - data, offsets := values.ByteArray() - return &byteArrayPage{ - typ: typ, - values: data, - offsets: offsets[:numValues+1], - columnIndex: ^columnIndex, - } -} - -func (page *byteArrayPage) Type() Type { return page.typ } - -func (page *byteArrayPage) Column() int { return int(^page.columnIndex) } - -func (page *byteArrayPage) Dictionary() Dictionary { return nil } - -func (page *byteArrayPage) NumRows() int64 { return int64(page.len()) } - -func (page *byteArrayPage) NumValues() int64 { return int64(page.len()) } - -func (page *byteArrayPage) NumNulls() int64 { return 0 } - -func (page *byteArrayPage) Size() int64 { return int64(len(page.values)) + 4*int64(len(page.offsets)) } - -func (page *byteArrayPage) RepetitionLevels() []byte { return nil } - -func (page *byteArrayPage) DefinitionLevels() []byte { return nil } - -func (page *byteArrayPage) Data() encoding.Values { - return encoding.ByteArrayValues(page.values, page.offsets) -} - -func (page *byteArrayPage) Values() ValueReader { return &byteArrayPageValues{page: page} } - -func (page *byteArrayPage) len() int { return len(page.offsets) - 1 } - -func (page *byteArrayPage) index(i int) []byte { - j := page.offsets[i+0] - k := page.offsets[i+1] - return page.values[j:k:k] -} - -func (page *byteArrayPage) min() (min []byte) { - if n := page.len(); n > 0 { - min = page.index(0) - - for i := 1; i < n; i++ { - v := page.index(i) - - if bytes.Compare(v, min) < 0 { - min = v - } - } - } - return min -} - -func (page 
*byteArrayPage) max() (max []byte) { - if n := page.len(); n > 0 { - max = page.index(0) - - for i := 1; i < n; i++ { - v := page.index(i) - - if bytes.Compare(v, max) > 0 { - max = v - } - } - } - return max -} - -func (page *byteArrayPage) bounds() (min, max []byte) { - if n := page.len(); n > 0 { - min = page.index(0) - max = min - - for i := 1; i < n; i++ { - v := page.index(i) - - switch { - case bytes.Compare(v, min) < 0: - min = v - case bytes.Compare(v, max) > 0: - max = v - } - } - } - return min, max -} - -func (page *byteArrayPage) Bounds() (min, max Value, ok bool) { - if ok = len(page.offsets) > 1; ok { - minBytes, maxBytes := page.bounds() - min = page.makeValueBytes(minBytes) - max = page.makeValueBytes(maxBytes) - } - return min, max, ok -} - -func (page *byteArrayPage) cloneValues() []byte { - values := make([]byte, len(page.values)) - copy(values, page.values) - return values -} - -func (page *byteArrayPage) cloneOffsets() []uint32 { - offsets := make([]uint32, len(page.offsets)) - copy(offsets, page.offsets) - return offsets -} - -func (page *byteArrayPage) Slice(i, j int64) Page { - return &byteArrayPage{ - typ: page.typ, - values: page.values, - offsets: page.offsets[i : j+1], - columnIndex: page.columnIndex, - } -} - -func (page *byteArrayPage) makeValueBytes(v []byte) Value { - value := makeValueBytes(ByteArray, v) - value.columnIndex = page.columnIndex - return value -} - -func (page *byteArrayPage) makeValueString(v string) Value { - value := makeValueString(ByteArray, v) - value.columnIndex = page.columnIndex - return value -} - -type fixedLenByteArrayPage struct { - typ Type - data []byte - size int - columnIndex int16 -} - -func newFixedLenByteArrayPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *fixedLenByteArrayPage { - data, size := values.FixedLenByteArray() - return &fixedLenByteArrayPage{ - typ: typ, - data: data[:int(numValues)*size], - size: size, - columnIndex: ^columnIndex, - } -} - -func (page 
*fixedLenByteArrayPage) Type() Type { return page.typ } - -func (page *fixedLenByteArrayPage) Column() int { return int(^page.columnIndex) } - -func (page *fixedLenByteArrayPage) Dictionary() Dictionary { return nil } - -func (page *fixedLenByteArrayPage) NumRows() int64 { return int64(len(page.data) / page.size) } - -func (page *fixedLenByteArrayPage) NumValues() int64 { return int64(len(page.data) / page.size) } - -func (page *fixedLenByteArrayPage) NumNulls() int64 { return 0 } - -func (page *fixedLenByteArrayPage) Size() int64 { return int64(len(page.data)) } - -func (page *fixedLenByteArrayPage) RepetitionLevels() []byte { return nil } - -func (page *fixedLenByteArrayPage) DefinitionLevels() []byte { return nil } - -func (page *fixedLenByteArrayPage) Data() encoding.Values { - return encoding.FixedLenByteArrayValues(page.data, page.size) -} - -func (page *fixedLenByteArrayPage) Values() ValueReader { - return &fixedLenByteArrayPageValues{page: page} -} - -func (page *fixedLenByteArrayPage) min() []byte { return minFixedLenByteArray(page.data, page.size) } - -func (page *fixedLenByteArrayPage) max() []byte { return maxFixedLenByteArray(page.data, page.size) } - -func (page *fixedLenByteArrayPage) bounds() (min, max []byte) { - return boundsFixedLenByteArray(page.data, page.size) -} - -func (page *fixedLenByteArrayPage) Bounds() (min, max Value, ok bool) { - if ok = len(page.data) > 0; ok { - minBytes, maxBytes := page.bounds() - min = page.makeValueBytes(minBytes) - max = page.makeValueBytes(maxBytes) - } - return min, max, ok -} - -func (page *fixedLenByteArrayPage) Slice(i, j int64) Page { - return &fixedLenByteArrayPage{ - typ: page.typ, - data: page.data[i*int64(page.size) : j*int64(page.size)], - size: page.size, - columnIndex: page.columnIndex, - } -} - -func (page *fixedLenByteArrayPage) makeValueBytes(v []byte) Value { - value := makeValueBytes(FixedLenByteArray, v) - value.columnIndex = page.columnIndex - return value -} - -func (page 
*fixedLenByteArrayPage) makeValueString(v string) Value { - value := makeValueString(FixedLenByteArray, v) - value.columnIndex = page.columnIndex - return value -} - -type uint32Page struct { - typ Type - values []uint32 - columnIndex int16 -} - -func newUint32Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *uint32Page { - return &uint32Page{ - typ: typ, - values: values.Uint32()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *uint32Page) Type() Type { return page.typ } - -func (page *uint32Page) Column() int { return int(^page.columnIndex) } - -func (page *uint32Page) Dictionary() Dictionary { return nil } - -func (page *uint32Page) NumRows() int64 { return int64(len(page.values)) } - -func (page *uint32Page) NumValues() int64 { return int64(len(page.values)) } - -func (page *uint32Page) NumNulls() int64 { return 0 } - -func (page *uint32Page) Size() int64 { return 4 * int64(len(page.values)) } - -func (page *uint32Page) RepetitionLevels() []byte { return nil } - -func (page *uint32Page) DefinitionLevels() []byte { return nil } - -func (page *uint32Page) Data() encoding.Values { return encoding.Uint32Values(page.values) } - -func (page *uint32Page) Values() ValueReader { return &uint32PageValues{page: page} } - -func (page *uint32Page) min() uint32 { return minUint32(page.values) } - -func (page *uint32Page) max() uint32 { return maxUint32(page.values) } - -func (page *uint32Page) bounds() (min, max uint32) { return boundsUint32(page.values) } - -func (page *uint32Page) Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minUint32, maxUint32 := page.bounds() - min = page.makeValue(minUint32) - max = page.makeValue(maxUint32) - } - return min, max, ok -} - -func (page *uint32Page) Slice(i, j int64) Page { - return &uint32Page{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *uint32Page) makeValue(v uint32) Value { - value := makeValueUint32(v) - 
value.columnIndex = page.columnIndex - return value -} - -type uint64Page struct { - typ Type - values []uint64 - columnIndex int16 -} - -func newUint64Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *uint64Page { - return &uint64Page{ - typ: typ, - values: values.Uint64()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *uint64Page) Type() Type { return page.typ } - -func (page *uint64Page) Column() int { return int(^page.columnIndex) } - -func (page *uint64Page) Dictionary() Dictionary { return nil } - -func (page *uint64Page) NumRows() int64 { return int64(len(page.values)) } - -func (page *uint64Page) NumValues() int64 { return int64(len(page.values)) } - -func (page *uint64Page) NumNulls() int64 { return 0 } - -func (page *uint64Page) Size() int64 { return 8 * int64(len(page.values)) } - -func (page *uint64Page) RepetitionLevels() []byte { return nil } - -func (page *uint64Page) DefinitionLevels() []byte { return nil } - -func (page *uint64Page) Data() encoding.Values { return encoding.Uint64Values(page.values) } - -func (page *uint64Page) Values() ValueReader { return &uint64PageValues{page: page} } - -func (page *uint64Page) min() uint64 { return minUint64(page.values) } - -func (page *uint64Page) max() uint64 { return maxUint64(page.values) } - -func (page *uint64Page) bounds() (min, max uint64) { return boundsUint64(page.values) } - -func (page *uint64Page) Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minUint64, maxUint64 := page.bounds() - min = page.makeValue(minUint64) - max = page.makeValue(maxUint64) - } - return min, max, ok -} - -func (page *uint64Page) Slice(i, j int64) Page { - return &uint64Page{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *uint64Page) makeValue(v uint64) Value { - value := makeValueUint64(v) - value.columnIndex = page.columnIndex - return value -} - -type be128Page struct { - typ Type - values [][16]byte 
- columnIndex int16 -} - -func newBE128Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *be128Page { - return &be128Page{ - typ: typ, - values: values.Uint128()[:numValues], - columnIndex: ^columnIndex, - } -} - -func (page *be128Page) Type() Type { return page.typ } - -func (page *be128Page) Column() int { return int(^page.columnIndex) } - -func (page *be128Page) Dictionary() Dictionary { return nil } - -func (page *be128Page) NumRows() int64 { return int64(len(page.values)) } - -func (page *be128Page) NumValues() int64 { return int64(len(page.values)) } - -func (page *be128Page) NumNulls() int64 { return 0 } - -func (page *be128Page) Size() int64 { return 16 * int64(len(page.values)) } - -func (page *be128Page) RepetitionLevels() []byte { return nil } - -func (page *be128Page) DefinitionLevels() []byte { return nil } - -func (page *be128Page) Data() encoding.Values { return encoding.Uint128Values(page.values) } - -func (page *be128Page) Values() ValueReader { return &be128PageValues{page: page} } - -func (page *be128Page) min() []byte { return minBE128(page.values) } - -func (page *be128Page) max() []byte { return maxBE128(page.values) } - -func (page *be128Page) bounds() (min, max []byte) { return boundsBE128(page.values) } - -func (page *be128Page) Bounds() (min, max Value, ok bool) { - if ok = len(page.values) > 0; ok { - minBytes, maxBytes := page.bounds() - min = page.makeValueBytes(minBytes) - max = page.makeValueBytes(maxBytes) - } - return min, max, ok -} - -func (page *be128Page) Slice(i, j int64) Page { - return &be128Page{ - typ: page.typ, - values: page.values[i:j], - columnIndex: page.columnIndex, - } -} - -func (page *be128Page) makeValue(v *[16]byte) Value { - return page.makeValueBytes(v[:]) -} - -func (page *be128Page) makeValueBytes(v []byte) Value { - value := makeValueBytes(FixedLenByteArray, v) - value.columnIndex = page.columnIndex - return value -} - -func (page *be128Page) makeValueString(v string) Value { - value 
:= makeValueString(FixedLenByteArray, v) - value.columnIndex = page.columnIndex - return value -} - -type nullPage struct { - typ Type - column int - count int -} - -func newNullPage(typ Type, columnIndex int16, numValues int32) *nullPage { - return &nullPage{ - typ: typ, - column: int(columnIndex), - count: int(numValues), - } -} - -func (page *nullPage) Type() Type { return page.typ } -func (page *nullPage) Column() int { return page.column } -func (page *nullPage) Dictionary() Dictionary { return nil } -func (page *nullPage) NumRows() int64 { return int64(page.count) } -func (page *nullPage) NumValues() int64 { return int64(page.count) } -func (page *nullPage) NumNulls() int64 { return int64(page.count) } -func (page *nullPage) Bounds() (min, max Value, ok bool) { return } -func (page *nullPage) Size() int64 { return 1 } -func (page *nullPage) Values() ValueReader { - return &nullPageValues{column: page.column, remain: page.count} -} -func (page *nullPage) Slice(i, j int64) Page { - return &nullPage{column: page.column, count: page.count - int(j-i)} -} -func (page *nullPage) RepetitionLevels() []byte { return nil } -func (page *nullPage) DefinitionLevels() []byte { return nil } -func (page *nullPage) Data() encoding.Values { return encoding.Values{} } +var ( + _ Page = (*optionalPage)(nil) + _ Page = (*repeatedPage)(nil) + _ Page = (*booleanPage)(nil) + _ Page = (*int32Page)(nil) + _ Page = (*int64Page)(nil) + _ Page = (*int96Page)(nil) + _ Page = (*floatPage)(nil) + _ Page = (*doublePage)(nil) + _ Page = (*byteArrayPage)(nil) + _ Page = (*fixedLenByteArrayPage)(nil) + _ Page = (*uint32Page)(nil) + _ Page = (*uint64Page)(nil) + _ Page = (*be128Page)(nil) + _ Page = (*nullPage)(nil) + _ Pages = (*singlePage)(nil) + _ PageReader = (*singlePage)(nil) +) diff --git a/vendor/github.com/parquet-go/parquet-go/page_be128.go b/vendor/github.com/parquet-go/parquet-go/page_be128.go new file mode 100644 index 00000000000..17374aea237 --- /dev/null +++ 
b/vendor/github.com/parquet-go/parquet-go/page_be128.go @@ -0,0 +1,99 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/parquet-go/encoding" +) + +type be128Page struct { + typ Type + values [][16]byte + columnIndex int16 +} + +func newBE128Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *be128Page { + return &be128Page{ + typ: typ, + values: values.Uint128()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *be128Page) Type() Type { return page.typ } + +func (page *be128Page) Column() int { return int(^page.columnIndex) } + +func (page *be128Page) Dictionary() Dictionary { return nil } + +func (page *be128Page) NumRows() int64 { return int64(len(page.values)) } + +func (page *be128Page) NumValues() int64 { return int64(len(page.values)) } + +func (page *be128Page) NumNulls() int64 { return 0 } + +func (page *be128Page) Size() int64 { return 16 * int64(len(page.values)) } + +func (page *be128Page) RepetitionLevels() []byte { return nil } + +func (page *be128Page) DefinitionLevels() []byte { return nil } + +func (page *be128Page) Data() encoding.Values { return encoding.Uint128Values(page.values) } + +func (page *be128Page) Values() ValueReader { return &be128PageValues{page: page} } + +func (page *be128Page) min() []byte { return minBE128(page.values) } + +func (page *be128Page) max() []byte { return maxBE128(page.values) } + +func (page *be128Page) bounds() (min, max []byte) { return boundsBE128(page.values) } + +func (page *be128Page) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minBytes, maxBytes := page.bounds() + min = page.makeValueBytes(minBytes) + max = page.makeValueBytes(maxBytes) + } + return min, max, ok +} + +func (page *be128Page) Slice(i, j int64) Page { + return &be128Page{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *be128Page) makeValue(v *[16]byte) Value { + return page.makeValueBytes(v[:]) +} + +func (page 
*be128Page) makeValueBytes(v []byte) Value { + value := makeValueBytes(FixedLenByteArray, v) + value.columnIndex = page.columnIndex + return value +} + +func (page *be128Page) makeValueString(v string) Value { + value := makeValueString(FixedLenByteArray, v) + value.columnIndex = page.columnIndex + return value +} + +type be128PageValues struct { + page *be128Page + offset int +} + +func (r *be128PageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(&r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_boolean.go b/vendor/github.com/parquet-go/parquet-go/page_boolean.go new file mode 100644 index 00000000000..ec757993b32 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_boolean.go @@ -0,0 +1,155 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack" + "github.com/parquet-go/parquet-go/encoding" +) + +type booleanPage struct { + typ Type + bits []byte + offset int32 + numValues int32 + columnIndex int16 +} + +func newBooleanPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *booleanPage { + return &booleanPage{ + typ: typ, + bits: values.Boolean()[:bitpack.ByteCount(uint(numValues))], + numValues: numValues, + columnIndex: ^columnIndex, + } +} + +func (page *booleanPage) Type() Type { return page.typ } + +func (page *booleanPage) Column() int { return int(^page.columnIndex) } + +func (page *booleanPage) Dictionary() Dictionary { return nil } + +func (page *booleanPage) NumRows() int64 { return int64(page.numValues) } + +func (page *booleanPage) NumValues() int64 { return int64(page.numValues) } + +func (page *booleanPage) NumNulls() int64 { return 0 } + +func (page *booleanPage) Size() int64 { return int64(len(page.bits)) } + +func (page *booleanPage) RepetitionLevels() []byte { return nil 
} + +func (page *booleanPage) DefinitionLevels() []byte { return nil } + +func (page *booleanPage) Data() encoding.Values { return encoding.BooleanValues(page.bits) } + +func (page *booleanPage) Values() ValueReader { return &booleanPageValues{page: page} } + +func (page *booleanPage) valueAt(i int) bool { + j := uint32(int(page.offset)+i) / 8 + k := uint32(int(page.offset)+i) % 8 + return ((page.bits[j] >> k) & 1) != 0 +} + +func (page *booleanPage) min() bool { + for i := range int(page.numValues) { + if !page.valueAt(i) { + return false + } + } + return page.numValues > 0 +} + +func (page *booleanPage) max() bool { + for i := range int(page.numValues) { + if page.valueAt(i) { + return true + } + } + return false +} + +func (page *booleanPage) bounds() (min, max bool) { + hasFalse, hasTrue := false, false + + for i := range int(page.numValues) { + v := page.valueAt(i) + if v { + hasTrue = true + } else { + hasFalse = true + } + if hasTrue && hasFalse { + break + } + } + + min = !hasFalse + max = hasTrue + return min, max +} + +func (page *booleanPage) Bounds() (min, max Value, ok bool) { + if ok = page.numValues > 0; ok { + minBool, maxBool := page.bounds() + min = page.makeValue(minBool) + max = page.makeValue(maxBool) + } + return min, max, ok +} + +func (page *booleanPage) Slice(i, j int64) Page { + lowWithOffset := i + int64(page.offset) + highWithOffset := j + int64(page.offset) + + off := lowWithOffset / 8 + end := highWithOffset / 8 + + if (highWithOffset % 8) != 0 { + end++ + } + + return &booleanPage{ + typ: page.typ, + bits: page.bits[off:end], + offset: int32(lowWithOffset % 8), + numValues: int32(j - i), + columnIndex: page.columnIndex, + } +} + +func (page *booleanPage) makeValue(v bool) Value { + value := makeValueBoolean(v) + value.columnIndex = page.columnIndex + return value +} + +type booleanPageValues struct { + page *booleanPage + offset int +} + +func (r *booleanPageValues) ReadBooleans(values []bool) (n int, err error) { + for n < 
len(values) && r.offset < int(r.page.numValues) { + values[n] = r.page.valueAt(r.offset) + r.offset++ + n++ + } + if r.offset == int(r.page.numValues) { + err = io.EOF + } + return n, err +} + +func (r *booleanPageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < int(r.page.numValues) { + values[n] = r.page.makeValue(r.page.valueAt(r.offset)) + r.offset++ + n++ + } + if r.offset == int(r.page.numValues) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_byte_array.go b/vendor/github.com/parquet-go/parquet-go/page_byte_array.go new file mode 100644 index 00000000000..48ffa187c6f --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_byte_array.go @@ -0,0 +1,203 @@ +package parquet + +import ( + "bytes" + "io" + + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/encoding/plain" +) + +type byteArrayPage struct { + typ Type + values []byte + offsets []uint32 + columnIndex int16 +} + +func newByteArrayPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *byteArrayPage { + data, offsets := values.ByteArray() + return &byteArrayPage{ + typ: typ, + values: data, + offsets: offsets[:numValues+1], + columnIndex: ^columnIndex, + } +} + +func (page *byteArrayPage) Type() Type { return page.typ } + +func (page *byteArrayPage) Column() int { return int(^page.columnIndex) } + +func (page *byteArrayPage) Dictionary() Dictionary { return nil } + +func (page *byteArrayPage) NumRows() int64 { return int64(page.len()) } + +func (page *byteArrayPage) NumValues() int64 { return int64(page.len()) } + +func (page *byteArrayPage) NumNulls() int64 { return 0 } + +func (page *byteArrayPage) Size() int64 { return int64(len(page.values)) + 4*int64(len(page.offsets)) } + +func (page *byteArrayPage) RepetitionLevels() []byte { return nil } + +func (page *byteArrayPage) DefinitionLevels() []byte { return nil } + +func (page *byteArrayPage) Data() 
encoding.Values { + return encoding.ByteArrayValues(page.values, page.offsets) +} + +func (page *byteArrayPage) Values() ValueReader { return &byteArrayPageValues{page: page} } + +func (page *byteArrayPage) len() int { return len(page.offsets) - 1 } + +func (page *byteArrayPage) index(i int) []byte { + j := page.offsets[i+0] + k := page.offsets[i+1] + return page.values[j:k:k] +} + +func (page *byteArrayPage) min() (min []byte) { + if n := page.len(); n > 0 { + min = page.index(0) + + for i := 1; i < n; i++ { + v := page.index(i) + + if bytes.Compare(v, min) < 0 { + min = v + } + } + } + return min +} + +func (page *byteArrayPage) max() (max []byte) { + if n := page.len(); n > 0 { + max = page.index(0) + + for i := 1; i < n; i++ { + v := page.index(i) + + if bytes.Compare(v, max) > 0 { + max = v + } + } + } + return max +} + +func (page *byteArrayPage) bounds() (min, max []byte) { + if n := page.len(); n > 0 { + min = page.index(0) + max = min + + for i := 1; i < n; i++ { + v := page.index(i) + + switch { + case bytes.Compare(v, min) < 0: + min = v + case bytes.Compare(v, max) > 0: + max = v + } + } + } + return min, max +} + +func (page *byteArrayPage) Bounds() (min, max Value, ok bool) { + if ok = len(page.offsets) > 1; ok { + minBytes, maxBytes := page.bounds() + min = page.makeValueBytes(minBytes) + max = page.makeValueBytes(maxBytes) + } + return min, max, ok +} + +func (page *byteArrayPage) cloneValues() []byte { + values := make([]byte, len(page.values)) + copy(values, page.values) + return values +} + +func (page *byteArrayPage) cloneOffsets() []uint32 { + offsets := make([]uint32, len(page.offsets)) + copy(offsets, page.offsets) + return offsets +} + +func (page *byteArrayPage) Slice(i, j int64) Page { + return &byteArrayPage{ + typ: page.typ, + values: page.values, + offsets: page.offsets[i : j+1], + columnIndex: page.columnIndex, + } +} + +func (page *byteArrayPage) makeValueBytes(v []byte) Value { + value := makeValueBytes(ByteArray, v) + 
value.columnIndex = page.columnIndex + return value +} + +func (page *byteArrayPage) makeValueString(v string) Value { + value := makeValueString(ByteArray, v) + value.columnIndex = page.columnIndex + return value +} + +type byteArrayPageValues struct { + page *byteArrayPage + offset int +} + +func (r *byteArrayPageValues) Read(b []byte) (int, error) { + _, n, err := r.readByteArrays(b) + return n, err +} + +func (r *byteArrayPageValues) ReadRequired(values []byte) (int, error) { + return r.ReadByteArrays(values) +} + +func (r *byteArrayPageValues) ReadByteArrays(values []byte) (int, error) { + n, _, err := r.readByteArrays(values) + return n, err +} + +func (r *byteArrayPageValues) readByteArrays(values []byte) (c, n int, err error) { + numValues := r.page.len() + for r.offset < numValues { + b := r.page.index(r.offset) + k := plain.ByteArrayLengthSize + len(b) + if k > (len(values) - n) { + break + } + plain.PutByteArrayLength(values[n:], len(b)) + n += plain.ByteArrayLengthSize + n += copy(values[n:], b) + r.offset++ + c++ + } + if r.offset == numValues { + err = io.EOF + } else if n == 0 && len(values) > 0 { + err = io.ErrShortBuffer + } + return c, n, err +} + +func (r *byteArrayPageValues) ReadValues(values []Value) (n int, err error) { + numValues := r.page.len() + for n < len(values) && r.offset < numValues { + values[n] = r.page.makeValueBytes(r.page.index(r.offset)) + r.offset++ + n++ + } + if r.offset == numValues { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_double.go b/vendor/github.com/parquet-go/parquet-go/page_double.go new file mode 100644 index 00000000000..6ac41ea190a --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_double.go @@ -0,0 +1,104 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/encoding" +) + +type doublePage struct { + typ Type + values []float64 + columnIndex int16 +} + +func newDoublePage(typ 
Type, columnIndex int16, numValues int32, values encoding.Values) *doublePage { + return &doublePage{ + typ: typ, + values: values.Double()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *doublePage) Type() Type { return page.typ } + +func (page *doublePage) Column() int { return int(^page.columnIndex) } + +func (page *doublePage) Dictionary() Dictionary { return nil } + +func (page *doublePage) NumRows() int64 { return int64(len(page.values)) } + +func (page *doublePage) NumValues() int64 { return int64(len(page.values)) } + +func (page *doublePage) NumNulls() int64 { return 0 } + +func (page *doublePage) Size() int64 { return 8 * int64(len(page.values)) } + +func (page *doublePage) RepetitionLevels() []byte { return nil } + +func (page *doublePage) DefinitionLevels() []byte { return nil } + +func (page *doublePage) Data() encoding.Values { return encoding.DoubleValues(page.values) } + +func (page *doublePage) Values() ValueReader { return &doublePageValues{page: page} } + +func (page *doublePage) min() float64 { return minFloat64(page.values) } + +func (page *doublePage) max() float64 { return maxFloat64(page.values) } + +func (page *doublePage) bounds() (min, max float64) { return boundsFloat64(page.values) } + +func (page *doublePage) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minFloat64, maxFloat64 := page.bounds() + min = page.makeValue(minFloat64) + max = page.makeValue(maxFloat64) + } + return min, max, ok +} + +func (page *doublePage) Slice(i, j int64) Page { + return &doublePage{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *doublePage) makeValue(v float64) Value { + value := makeValueDouble(v) + value.columnIndex = page.columnIndex + return value +} + +type doublePageValues struct { + page *doublePage + offset int +} + +func (r *doublePageValues) Read(b []byte) (n int, err error) { + n, err = r.ReadDoubles(unsafecast.Slice[float64](b)) + return 8 * n, 
err +} + +func (r *doublePageValues) ReadDoubles(values []float64) (n int, err error) { + n = copy(values, r.page.values[r.offset:]) + r.offset += n + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} + +func (r *doublePageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_fixed_len_byte_array.go b/vendor/github.com/parquet-go/parquet-go/page_fixed_len_byte_array.go new file mode 100644 index 00000000000..79bc3e86b73 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_fixed_len_byte_array.go @@ -0,0 +1,125 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/parquet-go/encoding" +) + +type fixedLenByteArrayPage struct { + typ Type + data []byte + size int + columnIndex int16 +} + +func newFixedLenByteArrayPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *fixedLenByteArrayPage { + data, size := values.FixedLenByteArray() + return &fixedLenByteArrayPage{ + typ: typ, + data: data[:int(numValues)*size], + size: size, + columnIndex: ^columnIndex, + } +} + +func (page *fixedLenByteArrayPage) Type() Type { return page.typ } + +func (page *fixedLenByteArrayPage) Column() int { return int(^page.columnIndex) } + +func (page *fixedLenByteArrayPage) Dictionary() Dictionary { return nil } + +func (page *fixedLenByteArrayPage) NumRows() int64 { return int64(len(page.data) / page.size) } + +func (page *fixedLenByteArrayPage) NumValues() int64 { return int64(len(page.data) / page.size) } + +func (page *fixedLenByteArrayPage) NumNulls() int64 { return 0 } + +func (page *fixedLenByteArrayPage) Size() int64 { return int64(len(page.data)) } + +func (page *fixedLenByteArrayPage) RepetitionLevels() []byte { return nil } + +func (page 
*fixedLenByteArrayPage) DefinitionLevels() []byte { return nil } + +func (page *fixedLenByteArrayPage) Data() encoding.Values { + return encoding.FixedLenByteArrayValues(page.data, page.size) +} + +func (page *fixedLenByteArrayPage) Values() ValueReader { + return &fixedLenByteArrayPageValues{page: page} +} + +func (page *fixedLenByteArrayPage) min() []byte { return minFixedLenByteArray(page.data, page.size) } + +func (page *fixedLenByteArrayPage) max() []byte { return maxFixedLenByteArray(page.data, page.size) } + +func (page *fixedLenByteArrayPage) bounds() (min, max []byte) { + return boundsFixedLenByteArray(page.data, page.size) +} + +func (page *fixedLenByteArrayPage) Bounds() (min, max Value, ok bool) { + if ok = len(page.data) > 0; ok { + minBytes, maxBytes := page.bounds() + min = page.makeValueBytes(minBytes) + max = page.makeValueBytes(maxBytes) + } + return min, max, ok +} + +func (page *fixedLenByteArrayPage) Slice(i, j int64) Page { + return &fixedLenByteArrayPage{ + typ: page.typ, + data: page.data[i*int64(page.size) : j*int64(page.size)], + size: page.size, + columnIndex: page.columnIndex, + } +} + +func (page *fixedLenByteArrayPage) makeValueBytes(v []byte) Value { + value := makeValueBytes(FixedLenByteArray, v) + value.columnIndex = page.columnIndex + return value +} + +func (page *fixedLenByteArrayPage) makeValueString(v string) Value { + value := makeValueString(FixedLenByteArray, v) + value.columnIndex = page.columnIndex + return value +} + +type fixedLenByteArrayPageValues struct { + page *fixedLenByteArrayPage + offset int +} + +func (r *fixedLenByteArrayPageValues) Read(b []byte) (n int, err error) { + n, err = r.ReadFixedLenByteArrays(b) + return n * r.page.size, err +} + +func (r *fixedLenByteArrayPageValues) ReadRequired(values []byte) (int, error) { + return r.ReadFixedLenByteArrays(values) +} + +func (r *fixedLenByteArrayPageValues) ReadFixedLenByteArrays(values []byte) (n int, err error) { + n = copy(values, r.page.data[r.offset:]) / 
r.page.size + r.offset += n * r.page.size + if r.offset == len(r.page.data) { + err = io.EOF + } else if n == 0 && len(values) > 0 { + err = io.ErrShortBuffer + } + return n, err +} + +func (r *fixedLenByteArrayPageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.data) { + values[n] = r.page.makeValueBytes(r.page.data[r.offset : r.offset+r.page.size]) + r.offset += r.page.size + n++ + } + if r.offset == len(r.page.data) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_float.go b/vendor/github.com/parquet-go/parquet-go/page_float.go new file mode 100644 index 00000000000..c6316c7560b --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_float.go @@ -0,0 +1,104 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/encoding" +) + +type floatPage struct { + typ Type + values []float32 + columnIndex int16 +} + +func newFloatPage(typ Type, columnIndex int16, numValues int32, values encoding.Values) *floatPage { + return &floatPage{ + typ: typ, + values: values.Float()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *floatPage) Type() Type { return page.typ } + +func (page *floatPage) Column() int { return int(^page.columnIndex) } + +func (page *floatPage) Dictionary() Dictionary { return nil } + +func (page *floatPage) NumRows() int64 { return int64(len(page.values)) } + +func (page *floatPage) NumValues() int64 { return int64(len(page.values)) } + +func (page *floatPage) NumNulls() int64 { return 0 } + +func (page *floatPage) Size() int64 { return 4 * int64(len(page.values)) } + +func (page *floatPage) RepetitionLevels() []byte { return nil } + +func (page *floatPage) DefinitionLevels() []byte { return nil } + +func (page *floatPage) Data() encoding.Values { return encoding.FloatValues(page.values) } + +func (page *floatPage) Values() ValueReader { return 
&floatPageValues{page: page} } + +func (page *floatPage) min() float32 { return minFloat32(page.values) } + +func (page *floatPage) max() float32 { return maxFloat32(page.values) } + +func (page *floatPage) bounds() (min, max float32) { return boundsFloat32(page.values) } + +func (page *floatPage) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minFloat32, maxFloat32 := page.bounds() + min = page.makeValue(minFloat32) + max = page.makeValue(maxFloat32) + } + return min, max, ok +} + +func (page *floatPage) Slice(i, j int64) Page { + return &floatPage{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *floatPage) makeValue(v float32) Value { + value := makeValueFloat(v) + value.columnIndex = page.columnIndex + return value +} + +type floatPageValues struct { + page *floatPage + offset int +} + +func (r *floatPageValues) Read(b []byte) (n int, err error) { + n, err = r.ReadFloats(unsafecast.Slice[float32](b)) + return 4 * n, err +} + +func (r *floatPageValues) ReadFloats(values []float32) (n int, err error) { + n = copy(values, r.page.values[r.offset:]) + r.offset += n + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} + +func (r *floatPageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_int32.go b/vendor/github.com/parquet-go/parquet-go/page_int32.go new file mode 100644 index 00000000000..74c83810f38 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_int32.go @@ -0,0 +1,104 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/encoding" +) + +type int32Page struct { + typ Type + values []int32 + columnIndex int16 
+} + +func newInt32Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int32Page { + return &int32Page{ + typ: typ, + values: values.Int32()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *int32Page) Type() Type { return page.typ } + +func (page *int32Page) Column() int { return int(^page.columnIndex) } + +func (page *int32Page) Dictionary() Dictionary { return nil } + +func (page *int32Page) NumRows() int64 { return int64(len(page.values)) } + +func (page *int32Page) NumValues() int64 { return int64(len(page.values)) } + +func (page *int32Page) NumNulls() int64 { return 0 } + +func (page *int32Page) Size() int64 { return 4 * int64(len(page.values)) } + +func (page *int32Page) RepetitionLevels() []byte { return nil } + +func (page *int32Page) DefinitionLevels() []byte { return nil } + +func (page *int32Page) Data() encoding.Values { return encoding.Int32Values(page.values) } + +func (page *int32Page) Values() ValueReader { return &int32PageValues{page: page} } + +func (page *int32Page) min() int32 { return minInt32(page.values) } + +func (page *int32Page) max() int32 { return maxInt32(page.values) } + +func (page *int32Page) bounds() (min, max int32) { return boundsInt32(page.values) } + +func (page *int32Page) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minInt32, maxInt32 := page.bounds() + min = page.makeValue(minInt32) + max = page.makeValue(maxInt32) + } + return min, max, ok +} + +func (page *int32Page) Slice(i, j int64) Page { + return &int32Page{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *int32Page) makeValue(v int32) Value { + value := makeValueInt32(v) + value.columnIndex = page.columnIndex + return value +} + +type int32PageValues struct { + page *int32Page + offset int +} + +func (r *int32PageValues) Read(b []byte) (n int, err error) { + n, err = r.ReadInt32s(unsafecast.Slice[int32](b)) + return 4 * n, err +} + +func (r 
*int32PageValues) ReadInt32s(values []int32) (n int, err error) { + n = copy(values, r.page.values[r.offset:]) + r.offset += n + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} + +func (r *int32PageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_int64.go b/vendor/github.com/parquet-go/parquet-go/page_int64.go new file mode 100644 index 00000000000..c660f5adb9e --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_int64.go @@ -0,0 +1,104 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/encoding" +) + +type int64Page struct { + typ Type + values []int64 + columnIndex int16 +} + +func newInt64Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int64Page { + return &int64Page{ + typ: typ, + values: values.Int64()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *int64Page) Type() Type { return page.typ } + +func (page *int64Page) Column() int { return int(^page.columnIndex) } + +func (page *int64Page) Dictionary() Dictionary { return nil } + +func (page *int64Page) NumRows() int64 { return int64(len(page.values)) } + +func (page *int64Page) NumValues() int64 { return int64(len(page.values)) } + +func (page *int64Page) NumNulls() int64 { return 0 } + +func (page *int64Page) Size() int64 { return 8 * int64(len(page.values)) } + +func (page *int64Page) RepetitionLevels() []byte { return nil } + +func (page *int64Page) DefinitionLevels() []byte { return nil } + +func (page *int64Page) Data() encoding.Values { return encoding.Int64Values(page.values) } + +func (page *int64Page) Values() ValueReader { return &int64PageValues{page: page} } + +func (page 
*int64Page) min() int64 { return minInt64(page.values) } + +func (page *int64Page) max() int64 { return maxInt64(page.values) } + +func (page *int64Page) bounds() (min, max int64) { return boundsInt64(page.values) } + +func (page *int64Page) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minInt64, maxInt64 := page.bounds() + min = page.makeValue(minInt64) + max = page.makeValue(maxInt64) + } + return min, max, ok +} + +func (page *int64Page) Slice(i, j int64) Page { + return &int64Page{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *int64Page) makeValue(v int64) Value { + value := makeValueInt64(v) + value.columnIndex = page.columnIndex + return value +} + +type int64PageValues struct { + page *int64Page + offset int +} + +func (r *int64PageValues) Read(b []byte) (n int, err error) { + n, err = r.ReadInt64s(unsafecast.Slice[int64](b)) + return 8 * n, err +} + +func (r *int64PageValues) ReadInt64s(values []int64) (n int, err error) { + n = copy(values, r.page.values[r.offset:]) + r.offset += n + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} + +func (r *int64PageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_int96.go b/vendor/github.com/parquet-go/parquet-go/page_int96.go new file mode 100644 index 00000000000..7cc1d6eb199 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_int96.go @@ -0,0 +1,107 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" +) + +type int96Page struct { + typ Type + values []deprecated.Int96 + columnIndex int16 +} + +func 
newInt96Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *int96Page { + return &int96Page{ + typ: typ, + values: values.Int96()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *int96Page) Type() Type { return page.typ } + +func (page *int96Page) Column() int { return int(^page.columnIndex) } + +func (page *int96Page) Dictionary() Dictionary { return nil } + +func (page *int96Page) NumRows() int64 { return int64(len(page.values)) } + +func (page *int96Page) NumValues() int64 { return int64(len(page.values)) } + +func (page *int96Page) NumNulls() int64 { return 0 } + +func (page *int96Page) Size() int64 { return 12 * int64(len(page.values)) } + +func (page *int96Page) RepetitionLevels() []byte { return nil } + +func (page *int96Page) DefinitionLevels() []byte { return nil } + +func (page *int96Page) Data() encoding.Values { return encoding.Int96Values(page.values) } + +func (page *int96Page) Values() ValueReader { return &int96PageValues{page: page} } + +func (page *int96Page) min() deprecated.Int96 { return deprecated.MinInt96(page.values) } + +func (page *int96Page) max() deprecated.Int96 { return deprecated.MaxInt96(page.values) } + +func (page *int96Page) bounds() (min, max deprecated.Int96) { + return deprecated.MinMaxInt96(page.values) +} + +func (page *int96Page) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minInt96, maxInt96 := page.bounds() + min = page.makeValue(minInt96) + max = page.makeValue(maxInt96) + } + return min, max, ok +} + +func (page *int96Page) Slice(i, j int64) Page { + return &int96Page{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *int96Page) makeValue(v deprecated.Int96) Value { + value := makeValueInt96(v) + value.columnIndex = page.columnIndex + return value +} + +type int96PageValues struct { + page *int96Page + offset int +} + +func (r *int96PageValues) Read(b []byte) (n int, err error) { + n, err = 
r.ReadInt96s(unsafecast.Slice[deprecated.Int96](b)) + return 12 * n, err +} + +func (r *int96PageValues) ReadInt96s(values []deprecated.Int96) (n int, err error) { + n = copy(values, r.page.values[r.offset:]) + r.offset += n + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} + +func (r *int96PageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_null.go b/vendor/github.com/parquet-go/parquet-go/page_null.go new file mode 100644 index 00000000000..ff554b945f0 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_null.go @@ -0,0 +1,57 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/parquet-go/encoding" +) + +type nullPage struct { + typ Type + column int + count int +} + +func newNullPage(typ Type, columnIndex int16, numValues int32) *nullPage { + return &nullPage{ + typ: typ, + column: int(columnIndex), + count: int(numValues), + } +} + +func (page *nullPage) Type() Type { return page.typ } +func (page *nullPage) Column() int { return page.column } +func (page *nullPage) Dictionary() Dictionary { return nil } +func (page *nullPage) NumRows() int64 { return int64(page.count) } +func (page *nullPage) NumValues() int64 { return int64(page.count) } +func (page *nullPage) NumNulls() int64 { return int64(page.count) } +func (page *nullPage) Bounds() (min, max Value, ok bool) { return } +func (page *nullPage) Size() int64 { return 1 } +func (page *nullPage) Values() ValueReader { + return &nullPageValues{column: page.column, remain: page.count} +} +func (page *nullPage) Slice(i, j int64) Page { + return &nullPage{column: page.column, count: page.count - int(j-i)} +} +func (page *nullPage) RepetitionLevels() []byte { return nil } +func (page 
*nullPage) DefinitionLevels() []byte { return nil } +func (page *nullPage) Data() encoding.Values { return encoding.Values{} } + +type nullPageValues struct { + column int + remain int +} + +func (r *nullPageValues) ReadValues(values []Value) (n int, err error) { + columnIndex := ^int16(r.column) + values = values[:min(r.remain, len(values))] + for i := range values { + values[i] = Value{columnIndex: columnIndex} + } + r.remain -= len(values) + if r.remain == 0 { + err = io.EOF + } + return len(values), err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_optional.go b/vendor/github.com/parquet-go/parquet-go/page_optional.go new file mode 100644 index 00000000000..9c51d1978ca --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_optional.go @@ -0,0 +1,112 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/parquet-go/encoding" +) + +type optionalPage struct { + base Page + maxDefinitionLevel byte + definitionLevels []byte +} + +func newOptionalPage(base Page, maxDefinitionLevel byte, definitionLevels []byte) *optionalPage { + return &optionalPage{ + base: base, + maxDefinitionLevel: maxDefinitionLevel, + definitionLevels: definitionLevels, + } +} + +func (page *optionalPage) Type() Type { return page.base.Type() } + +func (page *optionalPage) Column() int { return page.base.Column() } + +func (page *optionalPage) Dictionary() Dictionary { return page.base.Dictionary() } + +func (page *optionalPage) NumRows() int64 { return int64(len(page.definitionLevels)) } + +func (page *optionalPage) NumValues() int64 { return int64(len(page.definitionLevels)) } + +func (page *optionalPage) NumNulls() int64 { + return int64(countLevelsNotEqual(page.definitionLevels, page.maxDefinitionLevel)) +} + +func (page *optionalPage) Bounds() (min, max Value, ok bool) { return page.base.Bounds() } + +func (page *optionalPage) Size() int64 { return int64(len(page.definitionLevels)) + page.base.Size() } + +func (page *optionalPage) RepetitionLevels() 
[]byte { return nil } + +func (page *optionalPage) DefinitionLevels() []byte { return page.definitionLevels } + +func (page *optionalPage) Data() encoding.Values { return page.base.Data() } + +func (page *optionalPage) Values() ValueReader { + return &optionalPageValues{ + page: page, + values: page.base.Values(), + } +} + +func (page *optionalPage) Slice(i, j int64) Page { + maxDefinitionLevel := page.maxDefinitionLevel + definitionLevels := page.definitionLevels + numNulls1 := int64(countLevelsNotEqual(definitionLevels[:i], maxDefinitionLevel)) + numNulls2 := int64(countLevelsNotEqual(definitionLevels[i:j], maxDefinitionLevel)) + return newOptionalPage( + page.base.Slice(i-numNulls1, j-(numNulls1+numNulls2)), + maxDefinitionLevel, + definitionLevels[i:j:j], + ) +} + +type optionalPageValues struct { + page *optionalPage + values ValueReader + offset int +} + +func (r *optionalPageValues) ReadValues(values []Value) (n int, err error) { + maxDefinitionLevel := r.page.maxDefinitionLevel + definitionLevels := r.page.definitionLevels + columnIndex := ^int16(r.page.Column()) + + for n < len(values) && r.offset < len(definitionLevels) { + for n < len(values) && r.offset < len(definitionLevels) && definitionLevels[r.offset] != maxDefinitionLevel { + values[n] = Value{ + definitionLevel: definitionLevels[r.offset], + columnIndex: columnIndex, + } + r.offset++ + n++ + } + + i := n + j := r.offset + for i < len(values) && j < len(definitionLevels) && definitionLevels[j] == maxDefinitionLevel { + i++ + j++ + } + + if n < i { + for j, err = r.values.ReadValues(values[n:i]); j > 0; j-- { + values[n].definitionLevel = maxDefinitionLevel + r.offset++ + n++ + } + // Do not return on an io.EOF here as we may still have null values to read. 
+ if err != nil && err != io.EOF { + return n, err + } + err = nil + } + } + + if r.offset == len(definitionLevels) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_repeated.go b/vendor/github.com/parquet-go/parquet-go/page_repeated.go new file mode 100644 index 00000000000..2d850a66e2a --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_repeated.go @@ -0,0 +1,172 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/parquet-go/encoding" +) + +type repeatedPage struct { + base Page + maxRepetitionLevel byte + maxDefinitionLevel byte + definitionLevels []byte + repetitionLevels []byte +} + +func newRepeatedPage(base Page, maxRepetitionLevel, maxDefinitionLevel byte, repetitionLevels, definitionLevels []byte) *repeatedPage { + return &repeatedPage{ + base: base, + maxRepetitionLevel: maxRepetitionLevel, + maxDefinitionLevel: maxDefinitionLevel, + definitionLevels: definitionLevels, + repetitionLevels: repetitionLevels, + } +} + +func (page *repeatedPage) Type() Type { return page.base.Type() } + +func (page *repeatedPage) Column() int { return page.base.Column() } + +func (page *repeatedPage) Dictionary() Dictionary { return page.base.Dictionary() } + +func (page *repeatedPage) NumRows() int64 { return int64(countLevelsEqual(page.repetitionLevels, 0)) } + +func (page *repeatedPage) NumValues() int64 { return int64(len(page.definitionLevels)) } + +func (page *repeatedPage) NumNulls() int64 { + return int64(countLevelsNotEqual(page.definitionLevels, page.maxDefinitionLevel)) +} + +func (page *repeatedPage) Bounds() (min, max Value, ok bool) { return page.base.Bounds() } + +func (page *repeatedPage) Size() int64 { + return int64(len(page.repetitionLevels)) + int64(len(page.definitionLevels)) + page.base.Size() +} + +func (page *repeatedPage) RepetitionLevels() []byte { return page.repetitionLevels } + +func (page *repeatedPage) DefinitionLevels() []byte { return page.definitionLevels } + +func 
(page *repeatedPage) Data() encoding.Values { return page.base.Data() } + +func (page *repeatedPage) Values() ValueReader { + return &repeatedPageValues{ + page: page, + values: page.base.Values(), + } +} + +func (page *repeatedPage) Slice(i, j int64) Page { + numRows := page.NumRows() + if i < 0 || i > numRows { + panic(errPageBoundsOutOfRange(i, j, numRows)) + } + if j < 0 || j > numRows { + panic(errPageBoundsOutOfRange(i, j, numRows)) + } + if i > j { + panic(errPageBoundsOutOfRange(i, j, numRows)) + } + + maxRepetitionLevel := page.maxRepetitionLevel + maxDefinitionLevel := page.maxDefinitionLevel + repetitionLevels := page.repetitionLevels + definitionLevels := page.definitionLevels + + rowIndex0 := 0 + rowIndex1 := len(repetitionLevels) + rowIndex2 := len(repetitionLevels) + + for k, def := range repetitionLevels { + if def == 0 { + if rowIndex0 == int(i) { + rowIndex1 = k + break + } + rowIndex0++ + } + } + + for k, def := range repetitionLevels[rowIndex1:] { + if def == 0 { + if rowIndex0 == int(j) { + rowIndex2 = rowIndex1 + k + break + } + rowIndex0++ + } + } + + numNulls1 := countLevelsNotEqual(definitionLevels[:rowIndex1], maxDefinitionLevel) + numNulls2 := countLevelsNotEqual(definitionLevels[rowIndex1:rowIndex2], maxDefinitionLevel) + + i = int64(rowIndex1 - numNulls1) + j = int64(rowIndex2 - (numNulls1 + numNulls2)) + + return newRepeatedPage( + page.base.Slice(i, j), + maxRepetitionLevel, + maxDefinitionLevel, + repetitionLevels[rowIndex1:rowIndex2:rowIndex2], + definitionLevels[rowIndex1:rowIndex2:rowIndex2], + ) +} + +type repeatedPageValues struct { + page *repeatedPage + values ValueReader + offset int +} + +func (r *repeatedPageValues) ReadValues(values []Value) (n int, err error) { + maxDefinitionLevel := r.page.maxDefinitionLevel + definitionLevels := r.page.definitionLevels + repetitionLevels := r.page.repetitionLevels + columnIndex := ^int16(r.page.Column()) + + // While we haven't exceeded the output buffer and we haven't exceeded the 
page size. + for n < len(values) && r.offset < len(definitionLevels) { + + // While we haven't exceeded the output buffer and we haven't exceeded the + // page size AND the current element's definitionLevel is not the + // maxDefinitionLevel (this is a null value), Create the zero values to be + // returned in this run. + for n < len(values) && r.offset < len(definitionLevels) && definitionLevels[r.offset] != maxDefinitionLevel { + values[n] = Value{ + repetitionLevel: repetitionLevels[r.offset], + definitionLevel: definitionLevels[r.offset], + columnIndex: columnIndex, + } + r.offset++ + n++ + } + + i := n + j := r.offset + // Get the length of the run of non-zero values to be copied. + for i < len(values) && j < len(definitionLevels) && definitionLevels[j] == maxDefinitionLevel { + i++ + j++ + } + + // Copy all the non-zero values in this run. + if n < i { + for j, err = r.values.ReadValues(values[n:i]); j > 0; j-- { + values[n].repetitionLevel = repetitionLevels[r.offset] + values[n].definitionLevel = maxDefinitionLevel + r.offset++ + n++ + } + if err != nil && err != io.EOF { + return n, err + } + err = nil + } + } + + if r.offset == len(definitionLevels) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_uint32.go b/vendor/github.com/parquet-go/parquet-go/page_uint32.go new file mode 100644 index 00000000000..9e4c586aad2 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_uint32.go @@ -0,0 +1,104 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/encoding" +) + +type uint32Page struct { + typ Type + values []uint32 + columnIndex int16 +} + +func newUint32Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *uint32Page { + return &uint32Page{ + typ: typ, + values: values.Uint32()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *uint32Page) Type() Type { return page.typ } + +func (page 
*uint32Page) Column() int { return int(^page.columnIndex) } + +func (page *uint32Page) Dictionary() Dictionary { return nil } + +func (page *uint32Page) NumRows() int64 { return int64(len(page.values)) } + +func (page *uint32Page) NumValues() int64 { return int64(len(page.values)) } + +func (page *uint32Page) NumNulls() int64 { return 0 } + +func (page *uint32Page) Size() int64 { return 4 * int64(len(page.values)) } + +func (page *uint32Page) RepetitionLevels() []byte { return nil } + +func (page *uint32Page) DefinitionLevels() []byte { return nil } + +func (page *uint32Page) Data() encoding.Values { return encoding.Uint32Values(page.values) } + +func (page *uint32Page) Values() ValueReader { return &uint32PageValues{page: page} } + +func (page *uint32Page) min() uint32 { return minUint32(page.values) } + +func (page *uint32Page) max() uint32 { return maxUint32(page.values) } + +func (page *uint32Page) bounds() (min, max uint32) { return boundsUint32(page.values) } + +func (page *uint32Page) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minUint32, maxUint32 := page.bounds() + min = page.makeValue(minUint32) + max = page.makeValue(maxUint32) + } + return min, max, ok +} + +func (page *uint32Page) Slice(i, j int64) Page { + return &uint32Page{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *uint32Page) makeValue(v uint32) Value { + value := makeValueUint32(v) + value.columnIndex = page.columnIndex + return value +} + +type uint32PageValues struct { + page *uint32Page + offset int +} + +func (r *uint32PageValues) Read(b []byte) (n int, err error) { + n, err = r.ReadUint32s(unsafecast.Slice[uint32](b)) + return 4 * n, err +} + +func (r *uint32PageValues) ReadUint32s(values []uint32) (n int, err error) { + n = copy(values, r.page.values[r.offset:]) + r.offset += n + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} + +func (r *uint32PageValues) ReadValues(values 
[]Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_uint64.go b/vendor/github.com/parquet-go/parquet-go/page_uint64.go new file mode 100644 index 00000000000..cd01fd4a5d7 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/page_uint64.go @@ -0,0 +1,104 @@ +package parquet + +import ( + "io" + + "github.com/parquet-go/bitpack/unsafecast" + "github.com/parquet-go/parquet-go/encoding" +) + +type uint64Page struct { + typ Type + values []uint64 + columnIndex int16 +} + +func newUint64Page(typ Type, columnIndex int16, numValues int32, values encoding.Values) *uint64Page { + return &uint64Page{ + typ: typ, + values: values.Uint64()[:numValues], + columnIndex: ^columnIndex, + } +} + +func (page *uint64Page) Type() Type { return page.typ } + +func (page *uint64Page) Column() int { return int(^page.columnIndex) } + +func (page *uint64Page) Dictionary() Dictionary { return nil } + +func (page *uint64Page) NumRows() int64 { return int64(len(page.values)) } + +func (page *uint64Page) NumValues() int64 { return int64(len(page.values)) } + +func (page *uint64Page) NumNulls() int64 { return 0 } + +func (page *uint64Page) Size() int64 { return 8 * int64(len(page.values)) } + +func (page *uint64Page) RepetitionLevels() []byte { return nil } + +func (page *uint64Page) DefinitionLevels() []byte { return nil } + +func (page *uint64Page) Data() encoding.Values { return encoding.Uint64Values(page.values) } + +func (page *uint64Page) Values() ValueReader { return &uint64PageValues{page: page} } + +func (page *uint64Page) min() uint64 { return minUint64(page.values) } + +func (page *uint64Page) max() uint64 { return maxUint64(page.values) } + +func (page *uint64Page) bounds() (min, max uint64) { return boundsUint64(page.values) } + +func 
(page *uint64Page) Bounds() (min, max Value, ok bool) { + if ok = len(page.values) > 0; ok { + minUint64, maxUint64 := page.bounds() + min = page.makeValue(minUint64) + max = page.makeValue(maxUint64) + } + return min, max, ok +} + +func (page *uint64Page) Slice(i, j int64) Page { + return &uint64Page{ + typ: page.typ, + values: page.values[i:j], + columnIndex: page.columnIndex, + } +} + +func (page *uint64Page) makeValue(v uint64) Value { + value := makeValueUint64(v) + value.columnIndex = page.columnIndex + return value +} + +type uint64PageValues struct { + page *uint64Page + offset int +} + +func (r *uint64PageValues) Read(b []byte) (n int, err error) { + n, err = r.ReadUint64s(unsafecast.Slice[uint64](b)) + return 8 * n, err +} + +func (r *uint64PageValues) ReadUint64s(values []uint64) (n int, err error) { + n = copy(values, r.page.values[r.offset:]) + r.offset += n + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} + +func (r *uint64PageValues) ReadValues(values []Value) (n int, err error) { + for n < len(values) && r.offset < len(r.page.values) { + values[n] = r.page.makeValue(r.page.values[r.offset]) + r.offset++ + n++ + } + if r.offset == len(r.page.values) { + err = io.EOF + } + return n, err +} diff --git a/vendor/github.com/parquet-go/parquet-go/page_values.go b/vendor/github.com/parquet-go/parquet-go/page_values.go deleted file mode 100644 index ecbdffb0c8d..00000000000 --- a/vendor/github.com/parquet-go/parquet-go/page_values.go +++ /dev/null @@ -1,487 +0,0 @@ -package parquet - -import ( - "io" - - "github.com/parquet-go/parquet-go/deprecated" - "github.com/parquet-go/parquet-go/encoding/plain" - "github.com/parquet-go/parquet-go/internal/unsafecast" -) - -type optionalPageValues struct { - page *optionalPage - values ValueReader - offset int -} - -func (r *optionalPageValues) ReadValues(values []Value) (n int, err error) { - maxDefinitionLevel := r.page.maxDefinitionLevel - definitionLevels := r.page.definitionLevels - 
columnIndex := ^int16(r.page.Column()) - - for n < len(values) && r.offset < len(definitionLevels) { - for n < len(values) && r.offset < len(definitionLevels) && definitionLevels[r.offset] != maxDefinitionLevel { - values[n] = Value{ - definitionLevel: definitionLevels[r.offset], - columnIndex: columnIndex, - } - r.offset++ - n++ - } - - i := n - j := r.offset - for i < len(values) && j < len(definitionLevels) && definitionLevels[j] == maxDefinitionLevel { - i++ - j++ - } - - if n < i { - for j, err = r.values.ReadValues(values[n:i]); j > 0; j-- { - values[n].definitionLevel = maxDefinitionLevel - r.offset++ - n++ - } - // Do not return on an io.EOF here as we may still have null values to read. - if err != nil && err != io.EOF { - return n, err - } - err = nil - } - } - - if r.offset == len(definitionLevels) { - err = io.EOF - } - return n, err -} - -type repeatedPageValues struct { - page *repeatedPage - values ValueReader - offset int -} - -func (r *repeatedPageValues) ReadValues(values []Value) (n int, err error) { - maxDefinitionLevel := r.page.maxDefinitionLevel - definitionLevels := r.page.definitionLevels - repetitionLevels := r.page.repetitionLevels - columnIndex := ^int16(r.page.Column()) - - // While we haven't exceeded the output buffer and we haven't exceeded the page size. - for n < len(values) && r.offset < len(definitionLevels) { - - // While we haven't exceeded the output buffer and we haven't exceeded the - // page size AND the current element's definitionLevel is not the - // maxDefinitionLevel (this is a null value), Create the zero values to be - // returned in this run. - for n < len(values) && r.offset < len(definitionLevels) && definitionLevels[r.offset] != maxDefinitionLevel { - values[n] = Value{ - repetitionLevel: repetitionLevels[r.offset], - definitionLevel: definitionLevels[r.offset], - columnIndex: columnIndex, - } - r.offset++ - n++ - } - - i := n - j := r.offset - // Get the length of the run of non-zero values to be copied. 
- for i < len(values) && j < len(definitionLevels) && definitionLevels[j] == maxDefinitionLevel { - i++ - j++ - } - - // Copy all the non-zero values in this run. - if n < i { - for j, err = r.values.ReadValues(values[n:i]); j > 0; j-- { - values[n].repetitionLevel = repetitionLevels[r.offset] - values[n].definitionLevel = maxDefinitionLevel - r.offset++ - n++ - } - if err != nil && err != io.EOF { - return n, err - } - err = nil - } - } - - if r.offset == len(definitionLevels) { - err = io.EOF - } - return n, err -} - -type booleanPageValues struct { - page *booleanPage - offset int -} - -func (r *booleanPageValues) ReadBooleans(values []bool) (n int, err error) { - for n < len(values) && r.offset < int(r.page.numValues) { - values[n] = r.page.valueAt(r.offset) - r.offset++ - n++ - } - if r.offset == int(r.page.numValues) { - err = io.EOF - } - return n, err -} - -func (r *booleanPageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < int(r.page.numValues) { - values[n] = r.page.makeValue(r.page.valueAt(r.offset)) - r.offset++ - n++ - } - if r.offset == int(r.page.numValues) { - err = io.EOF - } - return n, err -} - -type int32PageValues struct { - page *int32Page - offset int -} - -func (r *int32PageValues) Read(b []byte) (n int, err error) { - n, err = r.ReadInt32s(unsafecast.Slice[int32](b)) - return 4 * n, err -} - -func (r *int32PageValues) ReadInt32s(values []int32) (n int, err error) { - n = copy(values, r.page.values[r.offset:]) - r.offset += n - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -func (r *int32PageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -type int64PageValues struct { - page *int64Page - offset int -} - -func (r *int64PageValues) Read(b []byte) (n 
int, err error) { - n, err = r.ReadInt64s(unsafecast.Slice[int64](b)) - return 8 * n, err -} - -func (r *int64PageValues) ReadInt64s(values []int64) (n int, err error) { - n = copy(values, r.page.values[r.offset:]) - r.offset += n - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -func (r *int64PageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -type int96PageValues struct { - page *int96Page - offset int -} - -func (r *int96PageValues) Read(b []byte) (n int, err error) { - n, err = r.ReadInt96s(unsafecast.Slice[deprecated.Int96](b)) - return 12 * n, err -} - -func (r *int96PageValues) ReadInt96s(values []deprecated.Int96) (n int, err error) { - n = copy(values, r.page.values[r.offset:]) - r.offset += n - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -func (r *int96PageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -type floatPageValues struct { - page *floatPage - offset int -} - -func (r *floatPageValues) Read(b []byte) (n int, err error) { - n, err = r.ReadFloats(unsafecast.Slice[float32](b)) - return 4 * n, err -} - -func (r *floatPageValues) ReadFloats(values []float32) (n int, err error) { - n = copy(values, r.page.values[r.offset:]) - r.offset += n - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -func (r *floatPageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == 
len(r.page.values) { - err = io.EOF - } - return n, err -} - -type doublePageValues struct { - page *doublePage - offset int -} - -func (r *doublePageValues) Read(b []byte) (n int, err error) { - n, err = r.ReadDoubles(unsafecast.Slice[float64](b)) - return 8 * n, err -} - -func (r *doublePageValues) ReadDoubles(values []float64) (n int, err error) { - n = copy(values, r.page.values[r.offset:]) - r.offset += n - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -func (r *doublePageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -type byteArrayPageValues struct { - page *byteArrayPage - offset int -} - -func (r *byteArrayPageValues) Read(b []byte) (int, error) { - _, n, err := r.readByteArrays(b) - return n, err -} - -func (r *byteArrayPageValues) ReadRequired(values []byte) (int, error) { - return r.ReadByteArrays(values) -} - -func (r *byteArrayPageValues) ReadByteArrays(values []byte) (int, error) { - n, _, err := r.readByteArrays(values) - return n, err -} - -func (r *byteArrayPageValues) readByteArrays(values []byte) (c, n int, err error) { - numValues := r.page.len() - for r.offset < numValues { - b := r.page.index(r.offset) - k := plain.ByteArrayLengthSize + len(b) - if k > (len(values) - n) { - break - } - plain.PutByteArrayLength(values[n:], len(b)) - n += plain.ByteArrayLengthSize - n += copy(values[n:], b) - r.offset++ - c++ - } - if r.offset == numValues { - err = io.EOF - } else if n == 0 && len(values) > 0 { - err = io.ErrShortBuffer - } - return c, n, err -} - -func (r *byteArrayPageValues) ReadValues(values []Value) (n int, err error) { - numValues := r.page.len() - for n < len(values) && r.offset < numValues { - values[n] = r.page.makeValueBytes(r.page.index(r.offset)) - r.offset++ - n++ - } - if r.offset 
== numValues { - err = io.EOF - } - return n, err -} - -type fixedLenByteArrayPageValues struct { - page *fixedLenByteArrayPage - offset int -} - -func (r *fixedLenByteArrayPageValues) Read(b []byte) (n int, err error) { - n, err = r.ReadFixedLenByteArrays(b) - return n * r.page.size, err -} - -func (r *fixedLenByteArrayPageValues) ReadRequired(values []byte) (int, error) { - return r.ReadFixedLenByteArrays(values) -} - -func (r *fixedLenByteArrayPageValues) ReadFixedLenByteArrays(values []byte) (n int, err error) { - n = copy(values, r.page.data[r.offset:]) / r.page.size - r.offset += n * r.page.size - if r.offset == len(r.page.data) { - err = io.EOF - } else if n == 0 && len(values) > 0 { - err = io.ErrShortBuffer - } - return n, err -} - -func (r *fixedLenByteArrayPageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.data) { - values[n] = r.page.makeValueBytes(r.page.data[r.offset : r.offset+r.page.size]) - r.offset += r.page.size - n++ - } - if r.offset == len(r.page.data) { - err = io.EOF - } - return n, err -} - -type uint32PageValues struct { - page *uint32Page - offset int -} - -func (r *uint32PageValues) Read(b []byte) (n int, err error) { - n, err = r.ReadUint32s(unsafecast.Slice[uint32](b)) - return 4 * n, err -} - -func (r *uint32PageValues) ReadUint32s(values []uint32) (n int, err error) { - n = copy(values, r.page.values[r.offset:]) - r.offset += n - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -func (r *uint32PageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -type uint64PageValues struct { - page *uint64Page - offset int -} - -func (r *uint64PageValues) Read(b []byte) (n int, err error) { - n, err = r.ReadUint64s(unsafecast.Slice[uint64](b)) - 
return 8 * n, err -} - -func (r *uint64PageValues) ReadUint64s(values []uint64) (n int, err error) { - n = copy(values, r.page.values[r.offset:]) - r.offset += n - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -func (r *uint64PageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -type be128PageValues struct { - page *be128Page - offset int -} - -func (r *be128PageValues) ReadValues(values []Value) (n int, err error) { - for n < len(values) && r.offset < len(r.page.values) { - values[n] = r.page.makeValue(&r.page.values[r.offset]) - r.offset++ - n++ - } - if r.offset == len(r.page.values) { - err = io.EOF - } - return n, err -} - -type nullPageValues struct { - column int - remain int -} - -func (r *nullPageValues) ReadValues(values []Value) (n int, err error) { - columnIndex := ^int16(r.column) - values = values[:min(r.remain, len(values))] - for i := range values { - values[i] = Value{columnIndex: columnIndex} - } - r.remain -= len(values) - if r.remain == 0 { - err = io.EOF - } - return len(values), err -} diff --git a/vendor/github.com/parquet-go/parquet-go/reader.go b/vendor/github.com/parquet-go/parquet-go/reader.go index c4277f786e9..fd4dc31b40f 100644 --- a/vendor/github.com/parquet-go/parquet-go/reader.go +++ b/vendor/github.com/parquet-go/parquet-go/reader.go @@ -46,7 +46,7 @@ func NewGenericReader[T any](input io.ReaderAt, options ...ReaderOption) *Generi if t == nil { c.Schema = rowGroup.Schema() } else { - c.Schema = schemaOf(dereference(t)) + c.Schema = schemaOf(dereference(t), c.SchemaConfig.StructTags...) 
} } @@ -80,7 +80,7 @@ func NewGenericRowGroupReader[T any](rowGroup RowGroup, options ...ReaderOption) if t == nil { c.Schema = rowGroup.Schema() } else { - c.Schema = schemaOf(dereference(t)) + c.Schema = schemaOf(dereference(t), c.SchemaConfig.StructTags...) } } diff --git a/vendor/github.com/parquet-go/parquet-go/row.go b/vendor/github.com/parquet-go/parquet-go/row.go index 2c05b6b6478..f30919f2c8b 100644 --- a/vendor/github.com/parquet-go/parquet-go/row.go +++ b/vendor/github.com/parquet-go/parquet-go/row.go @@ -594,9 +594,13 @@ func reconstructFuncOfOptional(columnIndex int16, node Node) (int16, reconstruct return nextColumnIndex, func(value reflect.Value, levels levels, columns [][]Value) error { levels.definitionLevel++ - if columns[0][0].definitionLevel < levels.definitionLevel { - value.SetZero() - return nil + // For empty groups (no columns), we can't check definition levels. + // Treat them as always present (non-null). + if len(columns) > 0 && len(columns[0]) > 0 { + if columns[0][0].definitionLevel < levels.definitionLevel { + value.SetZero() + return nil + } } if value.Kind() == reflect.Ptr { @@ -637,6 +641,12 @@ func reconstructFuncOfRepeated(columnIndex int16, node Node) (int16, reconstruct levels.repetitionDepth++ levels.definitionLevel++ + // Handle empty groups (no columns) + if len(columns) == 0 || len(columns[0]) == 0 { + setMakeSlice(value, 0) + return nil + } + if columns[0][0].definitionLevel < levels.definitionLevel { setMakeSlice(value, 0) return nil diff --git a/vendor/github.com/parquet-go/parquet-go/row_group.go b/vendor/github.com/parquet-go/parquet-go/row_group.go index a0d3d77054e..da80a5c7cb0 100644 --- a/vendor/github.com/parquet-go/parquet-go/row_group.go +++ b/vendor/github.com/parquet-go/parquet-go/row_group.go @@ -215,22 +215,9 @@ func newRowGroupRows(schema *Schema, columns []ColumnChunk, bufferSize int) *row columns: make([]columnChunkRows, len(columns)), rowIndex: -1, } - for i, column := range columns { - var release 
func(Page) - // Only release pages that are not byte array because the values - // that were read from the page might be retained by the program - // after calls to ReadRows. - switch column.Type().Kind() { - case ByteArray, FixedLenByteArray: - release = func(Page) {} - default: - release = Release - } - r.columns[i].reader.release = release r.columns[i].reader.pages = column.Pages() } - // This finalizer is used to ensure that the goroutines started by calling // init on the underlying page readers will be shutdown in the event that // Close isn't called and the rowGroupRows object is garbage collected. diff --git a/vendor/github.com/parquet-go/parquet-go/schema.go b/vendor/github.com/parquet-go/parquet-go/schema.go index b2049720657..7f26183c003 100644 --- a/vendor/github.com/parquet-go/parquet-go/schema.go +++ b/vendor/github.com/parquet-go/parquet-go/schema.go @@ -2,11 +2,15 @@ package parquet import ( "fmt" + "hash/maphash" + "maps" "math" "reflect" + "slices" "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/google/uuid" @@ -28,6 +32,7 @@ type Schema struct { root Node funcs onceValue[schemaFuncs] state onceValue[schemaState] + cache onceValue[schemaCache] } type schemaFuncs struct { @@ -40,6 +45,34 @@ type schemaState struct { columns [][]string } +type schemaCache struct { + hashSeed maphash.Seed + writeRows cacheMap[writeRowsCacheKey, writeRowsFunc] +} + +type writeRowsCacheKey struct { + gotype reflect.Type + column uint64 +} + +type cacheMap[K comparable, V any] struct { + value atomic.Value // map[K]V +} + +func (c *cacheMap[K, V]) load(k K, f func() V) V { + oldMap, _ := c.value.Load().(map[K]V) + value, ok := oldMap[k] + if ok { + return value + } + value = f() + newMap := make(map[K]V, len(oldMap)+1) + maps.Copy(newMap, oldMap) + newMap[k] = value + c.value.Store(newMap) + return value +} + type onceValue[T any] struct { once sync.Once value *T @@ -141,24 +174,37 @@ func (v *onceValue[T]) load(f func() *T) *T { // Note that the name of 
the element cannot be changed. // // The schema name is the Go type name of the value. -func SchemaOf(model any) *Schema { - return schemaOf(dereference(reflect.TypeOf(model))) +func SchemaOf(model any, opts ...SchemaOption) *Schema { + cfg := SchemaConfig{} + for _, opt := range opts { + opt.ConfigureSchema(&cfg) + } + return schemaOf(dereference(reflect.TypeOf(model)), cfg.StructTags...) } var cachedSchemas sync.Map // map[reflect.Type]*Schema -func schemaOf(model reflect.Type) *Schema { - cached, _ := cachedSchemas.Load(model) - schema, _ := cached.(*Schema) - if schema != nil { - return schema +func schemaOf(model reflect.Type, tagReplacements ...StructTagOption) *Schema { + cacheable := len(tagReplacements) == 0 + + if cacheable { + cached, _ := cachedSchemas.Load(model) + schema, _ := cached.(*Schema) + if schema != nil { + return schema + } } + if model.Kind() != reflect.Struct { panic("cannot construct parquet schema from value of type " + model.String()) } - schema = NewSchema(model.Name(), nodeOf(model, noTags)) - if actual, loaded := cachedSchemas.LoadOrStore(model, schema); loaded { - schema = actual.(*Schema) + + schema := NewSchema(model.Name(), nodeOf(nil, model, noTags, tagReplacements)) + + if cacheable { + if actual, loaded := cachedSchemas.LoadOrStore(model, schema); loaded { + schema = actual.(*Schema) + } } return schema } @@ -217,6 +263,14 @@ func (s *Schema) lazyLoadState() *schemaState { }) } +func (s *Schema) lazyLoadCache() *schemaCache { + return s.cache.load(func() *schemaCache { + return &schemaCache{ + hashSeed: maphash.MakeSeed(), + } + }) +} + // ConfigureRowGroup satisfies the RowGroupOption interface, allowing Schema // instances to be passed to row group constructors to pre-declare the schema of // the output parquet file. 
@@ -408,10 +462,10 @@ type structNode struct { fields []structField } -func structNodeOf(t reflect.Type) *structNode { +func structNodeOf(path []string, t reflect.Type, tagReplacements []StructTagOption) *structNode { // Collect struct fields first so we can order them before generating the // column indexes. - fields := structFieldsOf(t) + fields := structFieldsOf(path, t, tagReplacements) s := &structNode{ gotype: t, @@ -421,7 +475,7 @@ func structNodeOf(t reflect.Type) *structNode { for i := range fields { field := structField{name: fields[i].Name, index: fields[i].Index} tags := fromStructTag(fields[i].Tag) - field.Node = makeNodeOf(fields[i].Type, fields[i].Name, tags) + field.Node = makeNodeOf(append(path, fields[i].Name), fields[i].Type, fields[i].Name, tags, tagReplacements) s.fields[i] = field } @@ -429,31 +483,38 @@ func structNodeOf(t reflect.Type) *structNode { return s } -func structFieldsOf(t reflect.Type) []reflect.StructField { - fields := appendStructFields(t, nil, nil, 0) +// structFieldsOf returns the list of fields for the given path and type. Struct tags are replaced +// and fields potentially renamed using the provided options. +func structFieldsOf(path []string, t reflect.Type, tagReplacements []StructTagOption) []reflect.StructField { + return appendStructFields(path, t, nil, nil, 0, tagReplacements) +} - for i := range fields { - f := &fields[i] +func appendStructFields(path []string, t reflect.Type, fields []reflect.StructField, index []int, offset uintptr, tagReplacements []StructTagOption) []reflect.StructField { + for i, n := 0, t.NumField(); i < n; i++ { + f := t.Field(i) - if tag := f.Tag.Get("parquet"); tag != "" { - name, _ := split(tag) - if name != "" { - f.Name = name + // Tag replacements if present. + // Embedded anonymous fields do not extend the + // column path and tags are not used. 
+ if !f.Anonymous { + fpath := append(path, f.Name) + for _, opt := range tagReplacements { + if slices.Equal(fpath, opt.ColumnPath) { + f.Tag = opt.StructTag + } } } - } - return fields -} + ftags := fromStructTag(f.Tag) -func appendStructFields(t reflect.Type, fields []reflect.StructField, index []int, offset uintptr) []reflect.StructField { - for i, n := 0, t.NumField(); i < n; i++ { - f := t.Field(i) - if tag := f.Tag.Get("parquet"); tag != "" { + if tag := ftags.parquet; tag != "" { name, _ := split(tag) if tag != "-," && name == "-" { continue } + if name != "" { + f.Name = name + } } fieldIndex := index[:len(index):len(index)] @@ -462,7 +523,7 @@ func appendStructFields(t reflect.Type, fields []reflect.StructField, index []in f.Offset += offset if f.Anonymous { - fields = appendStructFields(f.Type, fields, fieldIndex, f.Offset) + fields = appendStructFields(path, f.Type, fields, fieldIndex, f.Offset, tagReplacements) } else if f.IsExported() { f.Index = fieldIndex fields = append(fields, f) @@ -578,7 +639,7 @@ func decimalFixedLenByteArraySize(precision int) int { } func forEachStructTagOption(sf reflect.StructField, do func(t reflect.Type, option, args string)) { - if tag := sf.Tag.Get("parquet"); tag != "" { + if tag := fromStructTag(sf.Tag).parquet; tag != "" { _, tag = split(tag) // skip the field name for tag != "" { option := "" @@ -594,7 +655,7 @@ func forEachStructTagOption(sf reflect.StructField, do func(t reflect.Type, opti } } -func nodeOf(t reflect.Type, tags parquetTags) Node { +func nodeOf(path []string, t reflect.Type, tags parquetTags, tagReplacements []StructTagOption) Node { switch t { case reflect.TypeOf(deprecated.Int96{}): return Leaf(Int96Type) @@ -631,13 +692,13 @@ func nodeOf(t reflect.Type, tags parquetTags) Node { n = String() case reflect.Ptr: - n = Optional(nodeOf(t.Elem(), noTags)) + n = Optional(nodeOf(path, t.Elem(), noTags, tagReplacements)) case reflect.Slice: if elem := t.Elem(); elem.Kind() == reflect.Uint8 { // []byte? 
n = Leaf(ByteArrayType) } else { - n = Repeated(nodeOf(elem, noTags)) + n = Repeated(nodeOf(path, elem, noTags, tagReplacements)) } case reflect.Array: @@ -651,8 +712,8 @@ func nodeOf(t reflect.Type, tags parquetTags) Node { n = JSON() } else { n = Map( - makeNodeOf(t.Key(), t.Name(), tags.getMapKeyNodeTags()), - makeNodeOf(t.Elem(), t.Name(), tags.getMapValueNodeTags()), + makeNodeOf(append(path, "key_value", "key"), t.Key(), t.Name(), tags.getMapKeyNodeTags(), tagReplacements), + makeNodeOf(append(path, "key_value", "value"), t.Elem(), t.Name(), tags.getMapValueNodeTags(), tagReplacements), ) } @@ -674,7 +735,7 @@ func nodeOf(t reflect.Type, tags parquetTags) Node { }) case reflect.Struct: - return structNodeOf(t) + return structNodeOf(path, t, tagReplacements) } if n == nil { @@ -805,7 +866,7 @@ var ( _ WriterOption = (*Schema)(nil) ) -func makeNodeOf(t reflect.Type, name string, tags parquetTags) Node { +func makeNodeOf(path []string, t reflect.Type, name string, tags parquetTags, tagReplacements []StructTagOption) Node { var ( node Node optional bool @@ -851,7 +912,7 @@ func makeNodeOf(t reflect.Type, name string, tags parquetTags) Node { } if t.Kind() == reflect.Map { - node = nodeOf(t, tags) + node = nodeOf(path, t, tags, tagReplacements) } else { forEachTagOption([]string{tags.parquet}, func(option, args string) { switch option { @@ -925,7 +986,7 @@ func makeNodeOf(t reflect.Type, name string, tags parquetTags) Node { case "list": switch t.Kind() { case reflect.Slice: - element := makeNodeOf(t.Elem(), t.Name(), tags.getListElementNodeTags()) + element := makeNodeOf(append(path, "list", "element"), t.Elem(), t.Name(), tags.getListElementNodeTags(), tagReplacements) setNode(element) setList() default: @@ -946,6 +1007,9 @@ func makeNodeOf(t reflect.Type, name string, tags parquetTags) Node { if t.Elem().Kind() != reflect.Uint8 || t.Len() != 16 { throwInvalidTag(t, name, option) } + setNode(UUID()) + case reflect.String: + setNode(UUID()) default: 
throwInvalidTag(t, name, option) } @@ -1023,6 +1087,18 @@ func makeNodeOf(t reflect.Type, name string, tags parquetTags) Node { throwInvalidTag(t, name, option+args) } setNode(TimestampAdjusted(timeUnit, adjusted)) + case reflect.Ptr: + // Support *time.Time with timestamp tags + if t.Elem() == reflect.TypeOf(time.Time{}) { + timeUnit, adjusted, err := parseTimestampArgs(args) + if err != nil { + throwInvalidTag(t, name, option+args) + } + // Wrap in Optional for schema correctness (nil pointers = NULL values) + setNode(Optional(TimestampAdjusted(timeUnit, adjusted))) + } else { + throwInvalidTag(t, name, option) + } default: switch t { case reflect.TypeOf(time.Time{}): @@ -1055,14 +1131,14 @@ func makeNodeOf(t reflect.Type, name string, tags parquetTags) Node { // Note for strings "optional" applies only to the entire BYTE_ARRAY and // not each individual byte. if optional && !isUint8 { - node = Repeated(Optional(nodeOf(t.Elem(), tags))) + node = Repeated(Optional(nodeOf(path, t.Elem(), tags, tagReplacements))) // Don't also apply "optional" to the whole list. 
optional = false } } if node == nil { - node = nodeOf(t, tags) + node = nodeOf(path, t, tags, tagReplacements) } if compressed != nil { diff --git a/vendor/github.com/parquet-go/parquet-go/sparse/gather.go b/vendor/github.com/parquet-go/parquet-go/sparse/gather.go index d7d72d091bf..5ead58aa22c 100644 --- a/vendor/github.com/parquet-go/parquet-go/sparse/gather.go +++ b/vendor/github.com/parquet-go/parquet-go/sparse/gather.go @@ -1,6 +1,6 @@ package sparse -import "github.com/parquet-go/parquet-go/internal/unsafecast" +import "github.com/parquet-go/bitpack/unsafecast" func GatherInt32(dst []int32, src Int32Array) int { return GatherUint32(unsafecast.Slice[uint32](dst), src.Uint32Array()) diff --git a/vendor/github.com/parquet-go/parquet-go/tags.go b/vendor/github.com/parquet-go/parquet-go/tags.go index 95ce2801b17..26f62a8d467 100644 --- a/vendor/github.com/parquet-go/parquet-go/tags.go +++ b/vendor/github.com/parquet-go/parquet-go/tags.go @@ -2,9 +2,7 @@ package parquet import "reflect" -var ( - noTags = parquetTags{} -) +var noTags = parquetTags{} // parquetTags represents the superset of all the parquet struct tags that can be used // to configure a field. 
diff --git a/vendor/github.com/parquet-go/parquet-go/type.go b/vendor/github.com/parquet-go/parquet-go/type.go index 987d0a8bcdd..54b9186802d 100644 --- a/vendor/github.com/parquet-go/parquet-go/type.go +++ b/vendor/github.com/parquet-go/parquet-go/type.go @@ -1,13 +1,7 @@ package parquet import ( - "bytes" - "encoding/json" - "fmt" - "math/bits" "reflect" - "time" - "unsafe" "github.com/parquet-go/parquet-go/deprecated" "github.com/parquet-go/parquet-go/encoding" @@ -288,2183 +282,3 @@ var convertedTypes = [...]deprecated.ConvertedType{ 20: deprecated.Bson, 21: deprecated.Interval, } - -type booleanType struct{} - -func (t booleanType) String() string { return "BOOLEAN" } -func (t booleanType) Kind() Kind { return Boolean } -func (t booleanType) Length() int { return 1 } -func (t booleanType) EstimateSize(n int) int { return (n + 7) / 8 } -func (t booleanType) EstimateNumValues(n int) int { return 8 * n } -func (t booleanType) Compare(a, b Value) int { return compareBool(a.boolean(), b.boolean()) } -func (t booleanType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } -func (t booleanType) LogicalType() *format.LogicalType { return nil } -func (t booleanType) ConvertedType() *deprecated.ConvertedType { return nil } -func (t booleanType) PhysicalType() *format.Type { return &physicalTypes[Boolean] } - -func (t booleanType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newBooleanColumnIndexer() -} - -func (t booleanType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newBooleanColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t booleanType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newBooleanDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t booleanType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newBooleanPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), 
data) -} - -func (t booleanType) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.BooleanValues(values) -} - -func (t booleanType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeBoolean(dst, src, enc) -} - -func (t booleanType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeBoolean(dst, src, enc) -} - -func (t booleanType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.EstimateSize(numValues) -} - -func (t booleanType) AssignValue(dst reflect.Value, src Value) error { - v := src.boolean() - switch dst.Kind() { - case reflect.Bool: - dst.SetBool(v) - default: - dst.Set(reflect.ValueOf(v)) - } - return nil -} - -func (t booleanType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *stringType: - return convertStringToBoolean(val) - } - switch typ.Kind() { - case Boolean: - return val, nil - case Int32: - return convertInt32ToBoolean(val) - case Int64: - return convertInt64ToBoolean(val) - case Int96: - return convertInt96ToBoolean(val) - case Float: - return convertFloatToBoolean(val) - case Double: - return convertDoubleToBoolean(val) - case ByteArray, FixedLenByteArray: - return convertByteArrayToBoolean(val) - default: - return makeValueKind(Boolean), nil - } -} - -type int32Type struct{} - -func (t int32Type) String() string { return "INT32" } -func (t int32Type) Kind() Kind { return Int32 } -func (t int32Type) Length() int { return 32 } -func (t int32Type) EstimateSize(n int) int { return 4 * n } -func (t int32Type) EstimateNumValues(n int) int { return n / 4 } -func (t int32Type) Compare(a, b Value) int { return compareInt32(a.int32(), b.int32()) } -func (t int32Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } -func (t int32Type) LogicalType() *format.LogicalType { - return &format.LogicalType{Integer: &format.IntType{ - 
BitWidth: 32, - IsSigned: true, - }} -} -func (t int32Type) ConvertedType() *deprecated.ConvertedType { return nil } -func (t int32Type) PhysicalType() *format.Type { return &physicalTypes[Int32] } - -func (t int32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newInt32ColumnIndexer() -} - -func (t int32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t int32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t int32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t int32Type) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.Int32ValuesFromBytes(values) -} - -func (t int32Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeInt32(dst, src, enc) -} - -func (t int32Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeInt32(dst, src, enc) -} - -func (t int32Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.EstimateSize(numValues) -} - -func (t int32Type) AssignValue(dst reflect.Value, src Value) error { - v := src.int32() - switch dst.Kind() { - case reflect.Int8, reflect.Int16, reflect.Int32: - dst.SetInt(int64(v)) - case reflect.Uint8, reflect.Uint16, reflect.Uint32: - dst.SetUint(uint64(v)) - default: - dst.Set(reflect.ValueOf(v)) - } - return nil -} - -func (t int32Type) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *stringType: - return convertStringToInt32(val) - } - switch typ.Kind() { - case Boolean: - return convertBooleanToInt32(val) - case Int32: - return 
val, nil - case Int64: - return convertInt64ToInt32(val) - case Int96: - return convertInt96ToInt32(val) - case Float: - return convertFloatToInt32(val) - case Double: - return convertDoubleToInt32(val) - case ByteArray, FixedLenByteArray: - return convertByteArrayToInt32(val) - default: - return makeValueKind(Int32), nil - } -} - -type int64Type struct{} - -func (t int64Type) String() string { return "INT64" } -func (t int64Type) Kind() Kind { return Int64 } -func (t int64Type) Length() int { return 64 } -func (t int64Type) EstimateSize(n int) int { return 8 * n } -func (t int64Type) EstimateNumValues(n int) int { return n / 8 } -func (t int64Type) Compare(a, b Value) int { return compareInt64(a.int64(), b.int64()) } -func (t int64Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } -func (t int64Type) LogicalType() *format.LogicalType { - return &format.LogicalType{Integer: &format.IntType{ - BitWidth: 64, - IsSigned: true, - }} -} -func (t int64Type) ConvertedType() *deprecated.ConvertedType { return nil } -func (t int64Type) PhysicalType() *format.Type { return &physicalTypes[Int64] } - -func (t int64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newInt64ColumnIndexer() -} - -func (t int64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t int64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t int64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t int64Type) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.Int64ValuesFromBytes(values) -} - -func (t int64Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, 
error) { - return encoding.EncodeInt64(dst, src, enc) -} - -func (t int64Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeInt64(dst, src, enc) -} - -func (t int64Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.EstimateSize(numValues) -} - -func (t int64Type) AssignValue(dst reflect.Value, src Value) error { - v := src.int64() - switch dst.Kind() { - case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: - dst.SetInt(v) - case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint, reflect.Uintptr: - dst.SetUint(uint64(v)) - default: - dst.Set(reflect.ValueOf(v)) - } - return nil -} - -func (t int64Type) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *stringType: - return convertStringToInt64(val) - } - switch typ.Kind() { - case Boolean: - return convertBooleanToInt64(val) - case Int32: - return convertInt32ToInt64(val) - case Int64: - return val, nil - case Int96: - return convertInt96ToInt64(val) - case Float: - return convertFloatToInt64(val) - case Double: - return convertDoubleToInt64(val) - case ByteArray, FixedLenByteArray: - return convertByteArrayToInt64(val) - default: - return makeValueKind(Int64), nil - } -} - -type int96Type struct{} - -func (t int96Type) String() string { return "INT96" } - -func (t int96Type) Kind() Kind { return Int96 } -func (t int96Type) Length() int { return 96 } -func (t int96Type) EstimateSize(n int) int { return 12 * n } -func (t int96Type) EstimateNumValues(n int) int { return n / 12 } -func (t int96Type) Compare(a, b Value) int { return compareInt96(a.int96(), b.int96()) } -func (t int96Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } -func (t int96Type) LogicalType() *format.LogicalType { return nil } -func (t int96Type) ConvertedType() *deprecated.ConvertedType { return nil } -func (t int96Type) PhysicalType() 
*format.Type { return &physicalTypes[Int96] } - -func (t int96Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newInt96ColumnIndexer() -} - -func (t int96Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newInt96ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t int96Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newInt96Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t int96Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newInt96Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t int96Type) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.Int96ValuesFromBytes(values) -} - -func (t int96Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeInt96(dst, src, enc) -} - -func (t int96Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeInt96(dst, src, enc) -} - -func (t int96Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.EstimateSize(numValues) -} - -func (t int96Type) AssignValue(dst reflect.Value, src Value) error { - v := src.Int96() - dst.Set(reflect.ValueOf(v)) - return nil -} - -func (t int96Type) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *stringType: - return convertStringToInt96(val) - } - switch typ.Kind() { - case Boolean: - return convertBooleanToInt96(val) - case Int32: - return convertInt32ToInt96(val) - case Int64: - return convertInt64ToInt96(val) - case Int96: - return val, nil - case Float: - return convertFloatToInt96(val) - case Double: - return convertDoubleToInt96(val) - case ByteArray, FixedLenByteArray: - return convertByteArrayToInt96(val) - default: - return makeValueKind(Int96), nil - } -} - -type 
floatType struct{} - -func (t floatType) String() string { return "FLOAT" } -func (t floatType) Kind() Kind { return Float } -func (t floatType) Length() int { return 32 } -func (t floatType) EstimateSize(n int) int { return 4 * n } -func (t floatType) EstimateNumValues(n int) int { return n / 4 } -func (t floatType) Compare(a, b Value) int { return compareFloat32(a.float(), b.float()) } -func (t floatType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } -func (t floatType) LogicalType() *format.LogicalType { return nil } -func (t floatType) ConvertedType() *deprecated.ConvertedType { return nil } -func (t floatType) PhysicalType() *format.Type { return &physicalTypes[Float] } - -func (t floatType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newFloatColumnIndexer() -} - -func (t floatType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newFloatColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t floatType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newFloatDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t floatType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newFloatPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t floatType) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.FloatValuesFromBytes(values) -} - -func (t floatType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeFloat(dst, src, enc) -} - -func (t floatType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeFloat(dst, src, enc) -} - -func (t floatType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.EstimateSize(numValues) -} - -func (t floatType) AssignValue(dst reflect.Value, src Value) 
error { - v := src.float() - switch dst.Kind() { - case reflect.Float32, reflect.Float64: - dst.SetFloat(float64(v)) - default: - dst.Set(reflect.ValueOf(v)) - } - return nil -} - -func (t floatType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *stringType: - return convertStringToFloat(val) - } - switch typ.Kind() { - case Boolean: - return convertBooleanToFloat(val) - case Int32: - return convertInt32ToFloat(val) - case Int64: - return convertInt64ToFloat(val) - case Int96: - return convertInt96ToFloat(val) - case Float: - return val, nil - case Double: - return convertDoubleToFloat(val) - case ByteArray, FixedLenByteArray: - return convertByteArrayToFloat(val) - default: - return makeValueKind(Float), nil - } -} - -type doubleType struct{} - -func (t doubleType) String() string { return "DOUBLE" } -func (t doubleType) Kind() Kind { return Double } -func (t doubleType) Length() int { return 64 } -func (t doubleType) EstimateSize(n int) int { return 8 * n } -func (t doubleType) EstimateNumValues(n int) int { return n / 8 } -func (t doubleType) Compare(a, b Value) int { return compareFloat64(a.double(), b.double()) } -func (t doubleType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } -func (t doubleType) LogicalType() *format.LogicalType { return nil } -func (t doubleType) ConvertedType() *deprecated.ConvertedType { return nil } -func (t doubleType) PhysicalType() *format.Type { return &physicalTypes[Double] } - -func (t doubleType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newDoubleColumnIndexer() -} - -func (t doubleType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newDoubleColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t doubleType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newDoubleDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t doubleType) 
NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newDoublePage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t doubleType) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.DoubleValuesFromBytes(values) -} - -func (t doubleType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeDouble(dst, src, enc) -} - -func (t doubleType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeDouble(dst, src, enc) -} - -func (t doubleType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.EstimateSize(numValues) -} - -func (t doubleType) AssignValue(dst reflect.Value, src Value) error { - v := src.double() - switch dst.Kind() { - case reflect.Float32, reflect.Float64: - dst.SetFloat(v) - default: - dst.Set(reflect.ValueOf(v)) - } - return nil -} - -func (t doubleType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *stringType: - return convertStringToDouble(val) - } - switch typ.Kind() { - case Boolean: - return convertBooleanToDouble(val) - case Int32: - return convertInt32ToDouble(val) - case Int64: - return convertInt64ToDouble(val) - case Int96: - return convertInt96ToDouble(val) - case Float: - return convertFloatToDouble(val) - case Double: - return val, nil - case ByteArray, FixedLenByteArray: - return convertByteArrayToDouble(val) - default: - return makeValueKind(Double), nil - } -} - -type byteArrayType struct{} - -func (t byteArrayType) String() string { return "BYTE_ARRAY" } -func (t byteArrayType) Kind() Kind { return ByteArray } -func (t byteArrayType) Length() int { return 0 } -func (t byteArrayType) EstimateSize(n int) int { return estimatedSizeOfByteArrayValues * n } -func (t byteArrayType) EstimateNumValues(n int) int { return n / estimatedSizeOfByteArrayValues } -func (t byteArrayType) Compare(a, b 
Value) int { return bytes.Compare(a.byteArray(), b.byteArray()) } -func (t byteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } -func (t byteArrayType) LogicalType() *format.LogicalType { return nil } -func (t byteArrayType) ConvertedType() *deprecated.ConvertedType { return nil } -func (t byteArrayType) PhysicalType() *format.Type { return &physicalTypes[ByteArray] } - -func (t byteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newByteArrayColumnIndexer(sizeLimit) -} - -func (t byteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t byteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t byteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t byteArrayType) NewValues(values []byte, offsets []uint32) encoding.Values { - return encoding.ByteArrayValues(values, offsets) -} - -func (t byteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeByteArray(dst, src, enc) -} - -func (t byteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeByteArray(dst, src, enc) -} - -func (t byteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return enc.EstimateDecodeByteArraySize(src) -} - -func (t byteArrayType) AssignValue(dst reflect.Value, src Value) error { - v := src.byteArray() - switch dst.Kind() { - case reflect.String: - dst.SetString(string(v)) - case reflect.Slice: - dst.SetBytes(copyBytes(v)) - default: - val := reflect.ValueOf(string(v)) - dst.Set(val) - } 
- return nil -} - -func (t byteArrayType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.Kind() { - case Boolean: - return convertBooleanToByteArray(val) - case Int32: - return convertInt32ToByteArray(val) - case Int64: - return convertInt64ToByteArray(val) - case Int96: - return convertInt96ToByteArray(val) - case Float: - return convertFloatToByteArray(val) - case Double: - return convertDoubleToByteArray(val) - case ByteArray, FixedLenByteArray: - return val, nil - default: - return makeValueKind(ByteArray), nil - } -} - -type fixedLenByteArrayType struct{ length int } - -func (t fixedLenByteArrayType) String() string { - return fmt.Sprintf("FIXED_LEN_BYTE_ARRAY(%d)", t.length) -} - -func (t fixedLenByteArrayType) Kind() Kind { return FixedLenByteArray } - -func (t fixedLenByteArrayType) Length() int { return t.length } - -func (t fixedLenByteArrayType) EstimateSize(n int) int { return t.length * n } - -func (t fixedLenByteArrayType) EstimateNumValues(n int) int { return n / t.length } - -func (t fixedLenByteArrayType) Compare(a, b Value) int { - return bytes.Compare(a.byteArray(), b.byteArray()) -} - -func (t fixedLenByteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } - -func (t fixedLenByteArrayType) LogicalType() *format.LogicalType { return nil } - -func (t fixedLenByteArrayType) ConvertedType() *deprecated.ConvertedType { return nil } - -func (t fixedLenByteArrayType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } - -func (t fixedLenByteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newFixedLenByteArrayColumnIndexer(t.length, sizeLimit) -} - -func (t fixedLenByteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t fixedLenByteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return 
newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t fixedLenByteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newFixedLenByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t fixedLenByteArrayType) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.FixedLenByteArrayValues(values, t.length) -} - -func (t fixedLenByteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeFixedLenByteArray(dst, src, enc) -} - -func (t fixedLenByteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeFixedLenByteArray(dst, src, enc) -} - -func (t fixedLenByteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.EstimateSize(numValues) -} - -func (t fixedLenByteArrayType) AssignValue(dst reflect.Value, src Value) error { - v := src.byteArray() - switch dst.Kind() { - case reflect.Array: - if dst.Type().Elem().Kind() == reflect.Uint8 && dst.Len() == len(v) { - // This code could be implemented as a call to reflect.Copy but - // it would require creating a reflect.Value from v which causes - // the heap allocation to pack the []byte value. To avoid this - // overhead we instead convert the reflect.Value holding the - // destination array into a byte slice which allows us to use - // a more efficient call to copy. 
- d := unsafe.Slice((*byte)(reflectValueData(dst)), len(v)) - copy(d, v) - return nil - } - case reflect.Slice: - dst.SetBytes(copyBytes(v)) - return nil - } - - val := reflect.ValueOf(copyBytes(v)) - dst.Set(val) - return nil -} - -func reflectValueData(v reflect.Value) unsafe.Pointer { - return (*[2]unsafe.Pointer)(unsafe.Pointer(&v))[1] -} - -func (t fixedLenByteArrayType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *stringType: - return convertStringToFixedLenByteArray(val, t.length) - } - switch typ.Kind() { - case Boolean: - return convertBooleanToFixedLenByteArray(val, t.length) - case Int32: - return convertInt32ToFixedLenByteArray(val, t.length) - case Int64: - return convertInt64ToFixedLenByteArray(val, t.length) - case Int96: - return convertInt96ToFixedLenByteArray(val, t.length) - case Float: - return convertFloatToFixedLenByteArray(val, t.length) - case Double: - return convertDoubleToFixedLenByteArray(val, t.length) - case ByteArray, FixedLenByteArray: - return convertByteArrayToFixedLenByteArray(val, t.length) - default: - return makeValueBytes(FixedLenByteArray, make([]byte, t.length)), nil - } -} - -type uint32Type struct{ int32Type } - -func (t uint32Type) Compare(a, b Value) int { - return compareUint32(a.uint32(), b.uint32()) -} - -func (t uint32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newUint32ColumnIndexer() -} - -func (t uint32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t uint32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t uint32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newUint32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -type uint64Type struct{ 
int64Type } - -func (t uint64Type) Compare(a, b Value) int { - return compareUint64(a.uint64(), b.uint64()) -} - -func (t uint64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newUint64ColumnIndexer() -} - -func (t uint64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t uint64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t uint64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newUint64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -// BE128 stands for "big-endian 128 bits". This type is used as a special case -// for fixed-length byte arrays of 16 bytes, which are commonly used to -// represent columns of random unique identifiers such as UUIDs. -// -// Comparisons of BE128 values use the natural byte order, the zeroth byte is -// the most significant byte. -// -// The special case is intended to provide optimizations based on the knowledge -// that the values are 16 bytes long. Stronger type checking can also be applied -// by the compiler when using [16]byte values rather than []byte, reducing the -// risk of errors on these common code paths. 
-type be128Type struct{} - -func (t be128Type) String() string { return "FIXED_LEN_BYTE_ARRAY(16)" } - -func (t be128Type) Kind() Kind { return FixedLenByteArray } - -func (t be128Type) Length() int { return 16 } - -func (t be128Type) EstimateSize(n int) int { return 16 * n } - -func (t be128Type) EstimateNumValues(n int) int { return n / 16 } - -func (t be128Type) Compare(a, b Value) int { return compareBE128(a.be128(), b.be128()) } - -func (t be128Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } - -func (t be128Type) LogicalType() *format.LogicalType { return nil } - -func (t be128Type) ConvertedType() *deprecated.ConvertedType { return nil } - -func (t be128Type) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } - -func (t be128Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newBE128ColumnIndexer() -} - -func (t be128Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newBE128ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t be128Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newBE128Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t be128Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newBE128Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t be128Type) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.FixedLenByteArrayValues(values, 16) -} - -func (t be128Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeFixedLenByteArray(dst, src, enc) -} - -func (t be128Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeFixedLenByteArray(dst, src, enc) -} - -func (t be128Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return 
t.EstimateSize(numValues) -} - -func (t be128Type) AssignValue(dst reflect.Value, src Value) error { - return fixedLenByteArrayType{length: 16}.AssignValue(dst, src) -} - -func (t be128Type) ConvertValue(val Value, typ Type) (Value, error) { - return fixedLenByteArrayType{length: 16}.ConvertValue(val, typ) -} - -// FixedLenByteArrayType constructs a type for fixed-length values of the given -// size (in bytes). -func FixedLenByteArrayType(length int) Type { - switch length { - case 16: - return be128Type{} - default: - return fixedLenByteArrayType{length: length} - } -} - -// Int constructs a leaf node of signed integer logical type of the given bit -// width. -// -// The bit width must be one of 8, 16, 32, 64, or the function will panic. -func Int(bitWidth int) Node { - return Leaf(integerType(bitWidth, &signedIntTypes)) -} - -// Uint constructs a leaf node of unsigned integer logical type of the given -// bit width. -// -// The bit width must be one of 8, 16, 32, 64, or the function will panic. 
-func Uint(bitWidth int) Node { - return Leaf(integerType(bitWidth, &unsignedIntTypes)) -} - -func integerType(bitWidth int, types *[4]intType) *intType { - switch bitWidth { - case 8: - return &types[0] - case 16: - return &types[1] - case 32: - return &types[2] - case 64: - return &types[3] - default: - panic(fmt.Sprintf("cannot create a %d bits parquet integer node", bitWidth)) - } -} - -var signedIntTypes = [...]intType{ - {BitWidth: 8, IsSigned: true}, - {BitWidth: 16, IsSigned: true}, - {BitWidth: 32, IsSigned: true}, - {BitWidth: 64, IsSigned: true}, -} - -var unsignedIntTypes = [...]intType{ - {BitWidth: 8, IsSigned: false}, - {BitWidth: 16, IsSigned: false}, - {BitWidth: 32, IsSigned: false}, - {BitWidth: 64, IsSigned: false}, -} - -type intType format.IntType - -func (t *intType) baseType() Type { - if t.IsSigned { - if t.BitWidth == 64 { - return int64Type{} - } else { - return int32Type{} - } - } else { - if t.BitWidth == 64 { - return uint64Type{} - } else { - return uint32Type{} - } - } -} - -func (t *intType) String() string { return (*format.IntType)(t).String() } - -func (t *intType) Kind() Kind { return t.baseType().Kind() } - -func (t *intType) Length() int { return int(t.BitWidth) } - -func (t *intType) EstimateSize(n int) int { return (int(t.BitWidth) / 8) * n } - -func (t *intType) EstimateNumValues(n int) int { return n / (int(t.BitWidth) / 8) } - -func (t *intType) Compare(a, b Value) int { - // This code is similar to t.baseType().Compare(a,b) but comparison methods - // tend to be invoked a lot (e.g. when sorting) so avoiding the interface - // indirection in this case yields much better throughput in some cases. 
- if t.BitWidth == 64 { - i1 := a.int64() - i2 := b.int64() - if t.IsSigned { - return compareInt64(i1, i2) - } else { - return compareUint64(uint64(i1), uint64(i2)) - } - } else { - i1 := a.int32() - i2 := b.int32() - if t.IsSigned { - return compareInt32(i1, i2) - } else { - return compareUint32(uint32(i1), uint32(i2)) - } - } -} - -func (t *intType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } - -func (t *intType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } - -func (t *intType) LogicalType() *format.LogicalType { - return &format.LogicalType{Integer: (*format.IntType)(t)} -} - -func (t *intType) ConvertedType() *deprecated.ConvertedType { - convertedType := bits.Len8(uint8(t.BitWidth)/8) - 1 // 8=>0, 16=>1, 32=>2, 64=>4 - if t.IsSigned { - convertedType += int(deprecated.Int8) - } else { - convertedType += int(deprecated.Uint8) - } - return &convertedTypes[convertedType] -} - -func (t *intType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return t.baseType().NewColumnIndexer(sizeLimit) -} - -func (t *intType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return t.baseType().NewColumnBuffer(columnIndex, numValues) -} - -func (t *intType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return t.baseType().NewDictionary(columnIndex, numValues, data) -} - -func (t *intType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return t.baseType().NewPage(columnIndex, numValues, data) -} - -func (t *intType) NewValues(values []byte, offsets []uint32) encoding.Values { - return t.baseType().NewValues(values, offsets) -} - -func (t *intType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return t.baseType().Encode(dst, src, enc) -} - -func (t *intType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return t.baseType().Decode(dst, src, enc) -} - -func (t *intType) 
EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.baseType().EstimateDecodeSize(numValues, src, enc) -} - -func (t *intType) AssignValue(dst reflect.Value, src Value) error { - if t.BitWidth == 64 { - return int64Type{}.AssignValue(dst, src) - } else { - return int32Type{}.AssignValue(dst, src) - } -} - -func (t *intType) ConvertValue(val Value, typ Type) (Value, error) { - if t.BitWidth == 64 { - return int64Type{}.ConvertValue(val, typ) - } else { - return int32Type{}.ConvertValue(val, typ) - } -} - -// Decimal constructs a leaf node of decimal logical type with the given -// scale, precision, and underlying type. -// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal -func Decimal(scale, precision int, typ Type) Node { - switch typ.Kind() { - case Int32, Int64, FixedLenByteArray: - default: - panic("DECIMAL node must annotate Int32, Int64 or FixedLenByteArray but got " + typ.String()) - } - return Leaf(&decimalType{ - decimal: format.DecimalType{ - Scale: int32(scale), - Precision: int32(precision), - }, - Type: typ, - }) -} - -type decimalType struct { - decimal format.DecimalType - Type -} - -func (t *decimalType) String() string { return t.decimal.String() } - -func (t *decimalType) LogicalType() *format.LogicalType { - return &format.LogicalType{Decimal: &t.decimal} -} - -func (t *decimalType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.Decimal] -} - -// String constructs a leaf node of UTF8 logical type. 
-// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string -func String() Node { return Leaf(&stringType{}) } - -type stringType format.StringType - -func (t *stringType) String() string { return (*format.StringType)(t).String() } - -func (t *stringType) Kind() Kind { return ByteArray } - -func (t *stringType) Length() int { return 0 } - -func (t *stringType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } - -func (t *stringType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } - -func (t *stringType) Compare(a, b Value) int { - return bytes.Compare(a.byteArray(), b.byteArray()) -} - -func (t *stringType) ColumnOrder() *format.ColumnOrder { - return &typeDefinedColumnOrder -} - -func (t *stringType) PhysicalType() *format.Type { - return &physicalTypes[ByteArray] -} - -func (t *stringType) LogicalType() *format.LogicalType { - return &format.LogicalType{UTF8: (*format.StringType)(t)} -} - -func (t *stringType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.UTF8] -} - -func (t *stringType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newByteArrayColumnIndexer(sizeLimit) -} - -func (t *stringType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t *stringType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t *stringType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) -} - -func (t *stringType) NewValues(values []byte, offsets []uint32) encoding.Values { - return encoding.ByteArrayValues(values, offsets) -} - -func (t *stringType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) 
([]byte, error) { - return encoding.EncodeByteArray(dst, src, enc) -} - -func (t *stringType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeByteArray(dst, src, enc) -} - -func (t *stringType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) -} - -func (t *stringType) AssignValue(dst reflect.Value, src Value) error { - return byteArrayType{}.AssignValue(dst, src) -} - -func (t *stringType) ConvertValue(val Value, typ Type) (Value, error) { - switch t2 := typ.(type) { - case *dateType: - return convertDateToString(val) - case *timeType: - tz := t2.tz() - if t2.Unit.Micros != nil { - return convertTimeMicrosToString(val, tz) - } else { - return convertTimeMillisToString(val, tz) - } - } - switch typ.Kind() { - case Boolean: - return convertBooleanToString(val) - case Int32: - return convertInt32ToString(val) - case Int64: - return convertInt64ToString(val) - case Int96: - return convertInt96ToString(val) - case Float: - return convertFloatToString(val) - case Double: - return convertDoubleToString(val) - case ByteArray: - return val, nil - case FixedLenByteArray: - return convertFixedLenByteArrayToString(val) - default: - return makeValueKind(ByteArray), nil - } -} - -// UUID constructs a leaf node of UUID logical type. 
-// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid -func UUID() Node { return Leaf(&uuidType{}) } - -type uuidType format.UUIDType - -func (t *uuidType) String() string { return (*format.UUIDType)(t).String() } - -func (t *uuidType) Kind() Kind { return be128Type{}.Kind() } - -func (t *uuidType) Length() int { return be128Type{}.Length() } - -func (t *uuidType) EstimateSize(n int) int { return be128Type{}.EstimateSize(n) } - -func (t *uuidType) EstimateNumValues(n int) int { return be128Type{}.EstimateNumValues(n) } - -func (t *uuidType) Compare(a, b Value) int { return be128Type{}.Compare(a, b) } - -func (t *uuidType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } - -func (t *uuidType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } - -func (t *uuidType) LogicalType() *format.LogicalType { - return &format.LogicalType{UUID: (*format.UUIDType)(t)} -} - -func (t *uuidType) ConvertedType() *deprecated.ConvertedType { return nil } - -func (t *uuidType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return be128Type{}.NewColumnIndexer(sizeLimit) -} - -func (t *uuidType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return be128Type{}.NewDictionary(columnIndex, numValues, data) -} - -func (t *uuidType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return be128Type{}.NewColumnBuffer(columnIndex, numValues) -} - -func (t *uuidType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return be128Type{}.NewPage(columnIndex, numValues, data) -} - -func (t *uuidType) NewValues(values []byte, offsets []uint32) encoding.Values { - return be128Type{}.NewValues(values, offsets) -} - -func (t *uuidType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return be128Type{}.Encode(dst, src, enc) -} - -func (t *uuidType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - 
return be128Type{}.Decode(dst, src, enc) -} - -func (t *uuidType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return be128Type{}.EstimateDecodeSize(numValues, src, enc) -} - -func (t *uuidType) AssignValue(dst reflect.Value, src Value) error { - return be128Type{}.AssignValue(dst, src) -} - -func (t *uuidType) ConvertValue(val Value, typ Type) (Value, error) { - return be128Type{}.ConvertValue(val, typ) -} - -// Enum constructs a leaf node with a logical type representing enumerations. -// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#enum -func Enum() Node { return Leaf(&enumType{}) } - -type enumType format.EnumType - -func (t *enumType) String() string { return (*format.EnumType)(t).String() } - -func (t *enumType) Kind() Kind { return new(stringType).Kind() } - -func (t *enumType) Length() int { return new(stringType).Length() } - -func (t *enumType) EstimateSize(n int) int { return new(stringType).EstimateSize(n) } - -func (t *enumType) EstimateNumValues(n int) int { return new(stringType).EstimateNumValues(n) } - -func (t *enumType) Compare(a, b Value) int { return new(stringType).Compare(a, b) } - -func (t *enumType) ColumnOrder() *format.ColumnOrder { return new(stringType).ColumnOrder() } - -func (t *enumType) PhysicalType() *format.Type { return new(stringType).PhysicalType() } - -func (t *enumType) LogicalType() *format.LogicalType { - return &format.LogicalType{Enum: (*format.EnumType)(t)} -} - -func (t *enumType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.Enum] -} - -func (t *enumType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return new(stringType).NewColumnIndexer(sizeLimit) -} - -func (t *enumType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return new(stringType).NewDictionary(columnIndex, numValues, data) -} - -func (t *enumType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return 
new(stringType).NewColumnBuffer(columnIndex, numValues) -} - -func (t *enumType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return new(stringType).NewPage(columnIndex, numValues, data) -} - -func (t *enumType) NewValues(values []byte, offsets []uint32) encoding.Values { - return new(stringType).NewValues(values, offsets) -} - -func (t *enumType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return new(stringType).Encode(dst, src, enc) -} - -func (t *enumType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return new(stringType).Decode(dst, src, enc) -} - -func (t *enumType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return new(stringType).EstimateDecodeSize(numValues, src, enc) -} - -func (t *enumType) AssignValue(dst reflect.Value, src Value) error { - return new(stringType).AssignValue(dst, src) -} - -func (t *enumType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *byteArrayType, *stringType, *enumType: - return val, nil - default: - return val, invalidConversion(val, "ENUM", typ.String()) - } -} - -// JSON constructs a leaf node of JSON logical type. 
-// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#json -func JSON() Node { return Leaf(&jsonType{}) } - -type jsonType format.JsonType - -func (t *jsonType) String() string { return (*format.JsonType)(t).String() } - -func (t *jsonType) Kind() Kind { return byteArrayType{}.Kind() } - -func (t *jsonType) Length() int { return byteArrayType{}.Length() } - -func (t *jsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } - -func (t *jsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } - -func (t *jsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } - -func (t *jsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } - -func (t *jsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } - -func (t *jsonType) LogicalType() *format.LogicalType { - return &format.LogicalType{Json: (*format.JsonType)(t)} -} - -func (t *jsonType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.Json] -} - -func (t *jsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return byteArrayType{}.NewColumnIndexer(sizeLimit) -} - -func (t *jsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return byteArrayType{}.NewDictionary(columnIndex, numValues, data) -} - -func (t *jsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) -} - -func (t *jsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return byteArrayType{}.NewPage(columnIndex, numValues, data) -} - -func (t *jsonType) NewValues(values []byte, offsets []uint32) encoding.Values { - return byteArrayType{}.NewValues(values, offsets) -} - -func (t *jsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return byteArrayType{}.Encode(dst, src, enc) -} - -func (t *jsonType) Decode(dst 
encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return byteArrayType{}.Decode(dst, src, enc) -} - -func (t *jsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) -} - -func (t *jsonType) AssignValue(dst reflect.Value, src Value) error { - // Assign value using ByteArrayType for BC... - switch dst.Kind() { - case reflect.String: - return byteArrayType{}.AssignValue(dst, src) - case reflect.Slice: - if dst.Type().Elem().Kind() == reflect.Uint8 { - return byteArrayType{}.AssignValue(dst, src) - } - } - - // Otherwise handle with json.Unmarshal - b := src.byteArray() - val := reflect.New(dst.Type()).Elem() - err := json.Unmarshal(b, val.Addr().Interface()) - if err != nil { - return err - } - dst.Set(val) - return nil -} - -func (t *jsonType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *byteArrayType, *stringType, *jsonType: - return val, nil - default: - return val, invalidConversion(val, "JSON", typ.String()) - } -} - -// BSON constructs a leaf node of BSON logical type. 
-// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#bson -func BSON() Node { return Leaf(&bsonType{}) } - -type bsonType format.BsonType - -func (t *bsonType) String() string { return (*format.BsonType)(t).String() } - -func (t *bsonType) Kind() Kind { return byteArrayType{}.Kind() } - -func (t *bsonType) Length() int { return byteArrayType{}.Length() } - -func (t *bsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } - -func (t *bsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } - -func (t *bsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } - -func (t *bsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } - -func (t *bsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } - -func (t *bsonType) LogicalType() *format.LogicalType { - return &format.LogicalType{Bson: (*format.BsonType)(t)} -} - -func (t *bsonType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.Bson] -} - -func (t *bsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return byteArrayType{}.NewColumnIndexer(sizeLimit) -} - -func (t *bsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return byteArrayType{}.NewDictionary(columnIndex, numValues, data) -} - -func (t *bsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) -} - -func (t *bsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return byteArrayType{}.NewPage(columnIndex, numValues, data) -} - -func (t *bsonType) NewValues(values []byte, offsets []uint32) encoding.Values { - return byteArrayType{}.NewValues(values, offsets) -} - -func (t *bsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return byteArrayType{}.Encode(dst, src, enc) -} - -func (t *bsonType) Decode(dst 
encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return byteArrayType{}.Decode(dst, src, enc) -} - -func (t *bsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) -} - -func (t *bsonType) AssignValue(dst reflect.Value, src Value) error { - return byteArrayType{}.AssignValue(dst, src) -} - -func (t *bsonType) ConvertValue(val Value, typ Type) (Value, error) { - switch typ.(type) { - case *byteArrayType, *bsonType: - return val, nil - default: - return val, invalidConversion(val, "BSON", typ.String()) - } -} - -// Date constructs a leaf node of DATE logical type. -// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date -func Date() Node { return Leaf(&dateType{}) } - -type dateType format.DateType - -func (t *dateType) String() string { return (*format.DateType)(t).String() } - -func (t *dateType) Kind() Kind { return int32Type{}.Kind() } - -func (t *dateType) Length() int { return int32Type{}.Length() } - -func (t *dateType) EstimateSize(n int) int { return int32Type{}.EstimateSize(n) } - -func (t *dateType) EstimateNumValues(n int) int { return int32Type{}.EstimateNumValues(n) } - -func (t *dateType) Compare(a, b Value) int { return int32Type{}.Compare(a, b) } - -func (t *dateType) ColumnOrder() *format.ColumnOrder { return int32Type{}.ColumnOrder() } - -func (t *dateType) PhysicalType() *format.Type { return int32Type{}.PhysicalType() } - -func (t *dateType) LogicalType() *format.LogicalType { - return &format.LogicalType{Date: (*format.DateType)(t)} -} - -func (t *dateType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.Date] -} - -func (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return int32Type{}.NewColumnIndexer(sizeLimit) -} - -func (t *dateType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return 
int32Type{}.NewDictionary(columnIndex, numValues, data) -} - -func (t *dateType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return int32Type{}.NewColumnBuffer(columnIndex, numValues) -} - -func (t *dateType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return int32Type{}.NewPage(columnIndex, numValues, data) -} - -func (t *dateType) NewValues(values []byte, offsets []uint32) encoding.Values { - return int32Type{}.NewValues(values, offsets) -} - -func (t *dateType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return int32Type{}.Encode(dst, src, enc) -} - -func (t *dateType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return int32Type{}.Decode(dst, src, enc) -} - -func (t *dateType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return int32Type{}.EstimateDecodeSize(numValues, src, enc) -} - -func (t *dateType) AssignValue(dst reflect.Value, src Value) error { - return int32Type{}.AssignValue(dst, src) -} - -func (t *dateType) ConvertValue(val Value, typ Type) (Value, error) { - switch src := typ.(type) { - case *stringType: - return convertStringToDate(val, time.UTC) - case *timestampType: - return convertTimestampToDate(val, src.Unit, src.tz()) - } - return int32Type{}.ConvertValue(val, typ) -} - -// TimeUnit represents units of time in the parquet type system. -type TimeUnit interface { - // Returns the precision of the time unit as a time.Duration value. - Duration() time.Duration - // Converts the TimeUnit value to its representation in the parquet thrift - // format. 
- TimeUnit() format.TimeUnit -} - -var ( - Millisecond TimeUnit = &millisecond{} - Microsecond TimeUnit = µsecond{} - Nanosecond TimeUnit = &nanosecond{} -) - -type millisecond format.MilliSeconds - -func (u *millisecond) Duration() time.Duration { return time.Millisecond } -func (u *millisecond) TimeUnit() format.TimeUnit { - return format.TimeUnit{Millis: (*format.MilliSeconds)(u)} -} - -type microsecond format.MicroSeconds - -func (u *microsecond) Duration() time.Duration { return time.Microsecond } -func (u *microsecond) TimeUnit() format.TimeUnit { - return format.TimeUnit{Micros: (*format.MicroSeconds)(u)} -} - -type nanosecond format.NanoSeconds - -func (u *nanosecond) Duration() time.Duration { return time.Nanosecond } -func (u *nanosecond) TimeUnit() format.TimeUnit { - return format.TimeUnit{Nanos: (*format.NanoSeconds)(u)} -} - -// Time constructs a leaf node of TIME logical type. -// IsAdjustedToUTC is true by default. -// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time -func Time(unit TimeUnit) Node { - return TimeAdjusted(unit, true) -} - -// TimeAdjusted constructs a leaf node of TIME logical type -// with the IsAdjustedToUTC property explicitly set. 
-// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time -func TimeAdjusted(unit TimeUnit, isAdjustedToUTC bool) Node { - return Leaf(&timeType{IsAdjustedToUTC: isAdjustedToUTC, Unit: unit.TimeUnit()}) -} - -type timeType format.TimeType - -func (t *timeType) tz() *time.Location { - if t.IsAdjustedToUTC { - return time.UTC - } else { - return time.Local - } -} - -func (t *timeType) baseType() Type { - if t.useInt32() { - return int32Type{} - } else { - return int64Type{} - } -} - -func (t *timeType) useInt32() bool { return t.Unit.Millis != nil } - -func (t *timeType) useInt64() bool { return t.Unit.Micros != nil } - -func (t *timeType) String() string { return (*format.TimeType)(t).String() } - -func (t *timeType) Kind() Kind { return t.baseType().Kind() } - -func (t *timeType) Length() int { return t.baseType().Length() } - -func (t *timeType) EstimateSize(n int) int { return t.baseType().EstimateSize(n) } - -func (t *timeType) EstimateNumValues(n int) int { return t.baseType().EstimateNumValues(n) } - -func (t *timeType) Compare(a, b Value) int { return t.baseType().Compare(a, b) } - -func (t *timeType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } - -func (t *timeType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } - -func (t *timeType) LogicalType() *format.LogicalType { - return &format.LogicalType{Time: (*format.TimeType)(t)} -} - -func (t *timeType) ConvertedType() *deprecated.ConvertedType { - switch { - case t.useInt32(): - return &convertedTypes[deprecated.TimeMillis] - case t.useInt64(): - return &convertedTypes[deprecated.TimeMicros] - default: - return nil - } -} - -func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return t.baseType().NewColumnIndexer(sizeLimit) -} - -func (t *timeType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return t.baseType().NewColumnBuffer(columnIndex, numValues) -} - -func (t *timeType) NewDictionary(columnIndex, 
numValues int, data encoding.Values) Dictionary { - return t.baseType().NewDictionary(columnIndex, numValues, data) -} - -func (t *timeType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return t.baseType().NewPage(columnIndex, numValues, data) -} - -func (t *timeType) NewValues(values []byte, offset []uint32) encoding.Values { - return t.baseType().NewValues(values, offset) -} - -func (t *timeType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return t.baseType().Encode(dst, src, enc) -} - -func (t *timeType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return t.baseType().Decode(dst, src, enc) -} - -func (t *timeType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return t.baseType().EstimateDecodeSize(numValues, src, enc) -} - -func (t *timeType) AssignValue(dst reflect.Value, src Value) error { - return t.baseType().AssignValue(dst, src) -} - -func (t *timeType) ConvertValue(val Value, typ Type) (Value, error) { - switch src := typ.(type) { - case *stringType: - tz := t.tz() - if t.Unit.Micros != nil { - return convertStringToTimeMicros(val, tz) - } else { - return convertStringToTimeMillis(val, tz) - } - case *timestampType: - tz := t.tz() - if t.Unit.Micros != nil { - return convertTimestampToTimeMicros(val, src.Unit, src.tz(), tz) - } else { - return convertTimestampToTimeMillis(val, src.Unit, src.tz(), tz) - } - } - return t.baseType().ConvertValue(val, typ) -} - -// Timestamp constructs of leaf node of TIMESTAMP logical type. -// IsAdjustedToUTC is true by default. -// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp -func Timestamp(unit TimeUnit) Node { - return TimestampAdjusted(unit, true) -} - -// TimestampAdjusted constructs a leaf node of TIMESTAMP logical type -// with the IsAdjustedToUTC property explicitly set. 
-// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time -func TimestampAdjusted(unit TimeUnit, isAdjustedToUTC bool) Node { - return Leaf(×tampType{IsAdjustedToUTC: isAdjustedToUTC, Unit: unit.TimeUnit()}) -} - -type timestampType format.TimestampType - -func (t *timestampType) tz() *time.Location { - if t.IsAdjustedToUTC { - return time.UTC - } else { - return time.Local - } -} - -func (t *timestampType) String() string { return (*format.TimestampType)(t).String() } - -func (t *timestampType) Kind() Kind { return int64Type{}.Kind() } - -func (t *timestampType) Length() int { return int64Type{}.Length() } - -func (t *timestampType) EstimateSize(n int) int { return int64Type{}.EstimateSize(n) } - -func (t *timestampType) EstimateNumValues(n int) int { return int64Type{}.EstimateNumValues(n) } - -func (t *timestampType) Compare(a, b Value) int { return int64Type{}.Compare(a, b) } - -func (t *timestampType) ColumnOrder() *format.ColumnOrder { return int64Type{}.ColumnOrder() } - -func (t *timestampType) PhysicalType() *format.Type { return int64Type{}.PhysicalType() } - -func (t *timestampType) LogicalType() *format.LogicalType { - return &format.LogicalType{Timestamp: (*format.TimestampType)(t)} -} - -func (t *timestampType) ConvertedType() *deprecated.ConvertedType { - switch { - case t.Unit.Millis != nil: - return &convertedTypes[deprecated.TimestampMillis] - case t.Unit.Micros != nil: - return &convertedTypes[deprecated.TimestampMicros] - default: - return nil - } -} - -func (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return int64Type{}.NewColumnIndexer(sizeLimit) -} - -func (t *timestampType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return int64Type{}.NewDictionary(columnIndex, numValues, data) -} - -func (t *timestampType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return int64Type{}.NewColumnBuffer(columnIndex, numValues) -} - -func (t *timestampType) 
NewPage(columnIndex, numValues int, data encoding.Values) Page { - return int64Type{}.NewPage(columnIndex, numValues, data) -} - -func (t *timestampType) NewValues(values []byte, offsets []uint32) encoding.Values { - return int64Type{}.NewValues(values, offsets) -} - -func (t *timestampType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return int64Type{}.Encode(dst, src, enc) -} - -func (t *timestampType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return int64Type{}.Decode(dst, src, enc) -} - -func (t *timestampType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return int64Type{}.EstimateDecodeSize(numValues, src, enc) -} - -func (t *timestampType) AssignValue(dst reflect.Value, src Value) error { - switch dst.Type() { - case reflect.TypeOf(time.Time{}): - unit := Nanosecond.TimeUnit() - lt := t.LogicalType() - if lt != nil && lt.Timestamp != nil { - unit = lt.Timestamp.Unit - } - - nanos := src.int64() - switch { - case unit.Millis != nil: - nanos = nanos * 1e6 - case unit.Micros != nil: - nanos = nanos * 1e3 - } - - val := time.Unix(0, nanos).UTC() - dst.Set(reflect.ValueOf(val)) - return nil - default: - return int64Type{}.AssignValue(dst, src) - } -} - -func (t *timestampType) ConvertValue(val Value, typ Type) (Value, error) { - switch src := typ.(type) { - case *timestampType: - return convertTimestampToTimestamp(val, src.Unit, t.Unit) - case *dateType: - return convertDateToTimestamp(val, t.Unit, t.tz()) - } - return int64Type{}.ConvertValue(val, typ) -} - -// List constructs a node of LIST logical type. 
-// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists -func List(of Node) Node { - return listNode{Group{"list": Repeated(Group{"element": of})}} -} - -type listNode struct{ Group } - -func (listNode) Type() Type { return &listType{} } - -type listType format.ListType - -func (t *listType) String() string { return (*format.ListType)(t).String() } - -func (t *listType) Kind() Kind { panic("cannot call Kind on parquet LIST type") } - -func (t *listType) Length() int { return 0 } - -func (t *listType) EstimateSize(int) int { return 0 } - -func (t *listType) EstimateNumValues(int) int { return 0 } - -func (t *listType) Compare(Value, Value) int { panic("cannot compare values on parquet LIST type") } - -func (t *listType) ColumnOrder() *format.ColumnOrder { return nil } - -func (t *listType) PhysicalType() *format.Type { return nil } - -func (t *listType) LogicalType() *format.LogicalType { - return &format.LogicalType{List: (*format.ListType)(t)} -} - -func (t *listType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.List] -} - -func (t *listType) NewColumnIndexer(int) ColumnIndexer { - panic("create create column indexer from parquet LIST type") -} - -func (t *listType) NewDictionary(int, int, encoding.Values) Dictionary { - panic("cannot create dictionary from parquet LIST type") -} - -func (t *listType) NewColumnBuffer(int, int) ColumnBuffer { - panic("cannot create column buffer from parquet LIST type") -} - -func (t *listType) NewPage(int, int, encoding.Values) Page { - panic("cannot create page from parquet LIST type") -} - -func (t *listType) NewValues(values []byte, _ []uint32) encoding.Values { - panic("cannot create values from parquet LIST type") -} - -func (t *listType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { - panic("cannot encode parquet LIST type") -} - -func (t *listType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { - 
panic("cannot decode parquet LIST type") -} - -func (t *listType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { - panic("cannot estimate decode size of parquet LIST type") -} - -func (t *listType) AssignValue(reflect.Value, Value) error { - panic("cannot assign value to a parquet LIST type") -} - -func (t *listType) ConvertValue(Value, Type) (Value, error) { - panic("cannot convert value to a parquet LIST type") -} - -// Map constructs a node of MAP logical type. -// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps -func Map(key, value Node) Node { - return mapNode{Group{ - "key_value": Repeated(Group{ - "key": Required(key), - "value": value, - }), - }} -} - -type mapNode struct{ Group } - -func (mapNode) Type() Type { return &mapType{} } - -type mapType format.MapType - -func (t *mapType) String() string { return (*format.MapType)(t).String() } - -func (t *mapType) Kind() Kind { panic("cannot call Kind on parquet MAP type") } - -func (t *mapType) Length() int { return 0 } - -func (t *mapType) EstimateSize(int) int { return 0 } - -func (t *mapType) EstimateNumValues(int) int { return 0 } - -func (t *mapType) Compare(Value, Value) int { panic("cannot compare values on parquet MAP type") } - -func (t *mapType) ColumnOrder() *format.ColumnOrder { return nil } - -func (t *mapType) PhysicalType() *format.Type { return nil } - -func (t *mapType) LogicalType() *format.LogicalType { - return &format.LogicalType{Map: (*format.MapType)(t)} -} - -func (t *mapType) ConvertedType() *deprecated.ConvertedType { - return &convertedTypes[deprecated.Map] -} - -func (t *mapType) NewColumnIndexer(int) ColumnIndexer { - panic("create create column indexer from parquet MAP type") -} - -func (t *mapType) NewDictionary(int, int, encoding.Values) Dictionary { - panic("cannot create dictionary from parquet MAP type") -} - -func (t *mapType) NewColumnBuffer(int, int) ColumnBuffer { - panic("cannot create column buffer from parquet MAP type") -} 
- -func (t *mapType) NewPage(int, int, encoding.Values) Page { - panic("cannot create page from parquet MAP type") -} - -func (t *mapType) NewValues(values []byte, _ []uint32) encoding.Values { - panic("cannot create values from parquet MAP type") -} - -func (t *mapType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { - panic("cannot encode parquet MAP type") -} - -func (t *mapType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { - panic("cannot decode parquet MAP type") -} - -func (t *mapType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { - panic("cannot estimate decode size of parquet MAP type") -} - -func (t *mapType) AssignValue(reflect.Value, Value) error { - panic("cannot assign value to a parquet MAP type") -} - -func (t *mapType) ConvertValue(Value, Type) (Value, error) { - panic("cannot convert value to a parquet MAP type") -} - -type nullType format.NullType - -func (t *nullType) String() string { return (*format.NullType)(t).String() } - -func (t *nullType) Kind() Kind { return -1 } - -func (t *nullType) Length() int { return 0 } - -func (t *nullType) EstimateSize(int) int { return 0 } - -func (t *nullType) EstimateNumValues(int) int { return 0 } - -func (t *nullType) Compare(Value, Value) int { panic("cannot compare values on parquet NULL type") } - -func (t *nullType) ColumnOrder() *format.ColumnOrder { return nil } - -func (t *nullType) PhysicalType() *format.Type { return nil } - -func (t *nullType) LogicalType() *format.LogicalType { - return &format.LogicalType{Unknown: (*format.NullType)(t)} -} - -func (t *nullType) ConvertedType() *deprecated.ConvertedType { return nil } - -func (t *nullType) NewColumnIndexer(int) ColumnIndexer { - panic("create create column indexer from parquet NULL type") -} - -func (t *nullType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newNullDictionary(t, makeColumnIndex(columnIndex), 
makeNumValues(numValues), data) -} - -func (t *nullType) NewColumnBuffer(int, int) ColumnBuffer { - panic("cannot create column buffer from parquet NULL type") -} - -func (t *nullType) NewPage(columnIndex, numValues int, _ encoding.Values) Page { - return newNullPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) -} - -func (t *nullType) NewValues(_ []byte, _ []uint32) encoding.Values { - return encoding.Values{} -} - -func (t *nullType) Encode(dst []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { - return dst[:0], nil -} - -func (t *nullType) Decode(dst encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { - return dst, nil -} - -func (t *nullType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { - return 0 -} - -func (t *nullType) AssignValue(reflect.Value, Value) error { - return nil -} - -func (t *nullType) ConvertValue(val Value, _ Type) (Value, error) { - return val, nil -} - -// Variant constructs a node of unshredded VARIANT logical type. It is a group with -// two required fields, "metadata" and "value", both byte arrays. -// -// Experimental: The specification for variants is still being developed and the type -// is not fully adopted. Support for this type is subject to change. -// -// Initial support does not attempt to process the variant data. So reading and writing -// data of this type behaves as if it were just a group with two byte array fields, as -// if the logical type annotation were absent. This may change in the future. -// -// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#variant -func Variant() Node { - return variantNode{Group{"metadata": Required(Leaf(ByteArrayType)), "value": Required(Leaf(ByteArrayType))}} -} - -// TODO: add ShreddedVariant(Node) function, to create a shredded variant -// where the argument defines the type/structure of the shredded value(s). 
- -type variantNode struct{ Group } - -func (variantNode) Type() Type { return &variantType{} } - -type variantType format.VariantType - -func (t *variantType) String() string { return (*format.VariantType)(t).String() } - -func (t *variantType) Kind() Kind { panic("cannot call Kind on parquet VARIANT type") } - -func (t *variantType) Length() int { return 0 } - -func (t *variantType) EstimateSize(int) int { return 0 } - -func (t *variantType) EstimateNumValues(int) int { return 0 } - -func (t *variantType) Compare(Value, Value) int { - panic("cannot compare values on parquet VARIANT type") -} - -func (t *variantType) ColumnOrder() *format.ColumnOrder { return nil } - -func (t *variantType) PhysicalType() *format.Type { return nil } - -func (t *variantType) LogicalType() *format.LogicalType { - return &format.LogicalType{Variant: (*format.VariantType)(t)} -} - -func (t *variantType) ConvertedType() *deprecated.ConvertedType { return nil } - -func (t *variantType) NewColumnIndexer(int) ColumnIndexer { - panic("create create column indexer from parquet VARIANT type") -} - -func (t *variantType) NewDictionary(int, int, encoding.Values) Dictionary { - panic("cannot create dictionary from parquet VARIANT type") -} - -func (t *variantType) NewColumnBuffer(int, int) ColumnBuffer { - panic("cannot create column buffer from parquet VARIANT type") -} - -func (t *variantType) NewPage(int, int, encoding.Values) Page { - panic("cannot create page from parquet VARIANT type") -} - -func (t *variantType) NewValues(values []byte, _ []uint32) encoding.Values { - panic("cannot create values from parquet VARIANT type") -} - -func (t *variantType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { - panic("cannot encode parquet VARIANT type") -} - -func (t *variantType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { - panic("cannot decode parquet VARIANT type") -} - -func (t *variantType) EstimateDecodeSize(_ int, _ []byte, _ 
encoding.Encoding) int { - panic("cannot estimate decode size of parquet VARIANT type") -} - -func (t *variantType) AssignValue(reflect.Value, Value) error { - panic("cannot assign value to a parquet VARIANT type") -} - -func (t *variantType) ConvertValue(Value, Type) (Value, error) { - panic("cannot convert value to a parquet VARIANT type") -} - -type groupType struct{} - -func (groupType) String() string { return "group" } - -func (groupType) Kind() Kind { - panic("cannot call Kind on parquet group") -} - -func (groupType) Compare(Value, Value) int { - panic("cannot compare values on parquet group") -} - -func (groupType) NewColumnIndexer(int) ColumnIndexer { - panic("cannot create column indexer from parquet group") -} - -func (groupType) NewDictionary(int, int, encoding.Values) Dictionary { - panic("cannot create dictionary from parquet group") -} - -func (t groupType) NewColumnBuffer(int, int) ColumnBuffer { - panic("cannot create column buffer from parquet group") -} - -func (t groupType) NewPage(int, int, encoding.Values) Page { - panic("cannot create page from parquet group") -} - -func (t groupType) NewValues(_ []byte, _ []uint32) encoding.Values { - panic("cannot create values from parquet group") -} - -func (groupType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { - panic("cannot encode parquet group") -} - -func (groupType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { - panic("cannot decode parquet group") -} - -func (groupType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { - panic("cannot estimate decode size of parquet group") -} - -func (groupType) AssignValue(reflect.Value, Value) error { - panic("cannot assign value to a parquet group") -} - -func (t groupType) ConvertValue(Value, Type) (Value, error) { - panic("cannot convert value to a parquet group") -} - -func (groupType) Length() int { return 0 } - -func (groupType) EstimateSize(int) int { return 0 } - -func 
(groupType) EstimateNumValues(int) int { return 0 } - -func (groupType) ColumnOrder() *format.ColumnOrder { return nil } - -func (groupType) PhysicalType() *format.Type { return nil } - -func (groupType) LogicalType() *format.LogicalType { return nil } - -func (groupType) ConvertedType() *deprecated.ConvertedType { return nil } - -func checkTypeKindEqual(to, from Type) error { - if to.Kind() != from.Kind() { - return fmt.Errorf("cannot convert from parquet value of type %s to %s", from, to) - } - return nil -} diff --git a/vendor/github.com/parquet-go/parquet-go/type_boolean.go b/vendor/github.com/parquet-go/parquet-go/type_boolean.go new file mode 100644 index 00000000000..bb173ff2629 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_boolean.go @@ -0,0 +1,90 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type booleanType struct{} + +func (t booleanType) String() string { return "BOOLEAN" } +func (t booleanType) Kind() Kind { return Boolean } +func (t booleanType) Length() int { return 1 } +func (t booleanType) EstimateSize(n int) int { return (n + 7) / 8 } +func (t booleanType) EstimateNumValues(n int) int { return 8 * n } +func (t booleanType) Compare(a, b Value) int { return compareBool(a.boolean(), b.boolean()) } +func (t booleanType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t booleanType) LogicalType() *format.LogicalType { return nil } +func (t booleanType) ConvertedType() *deprecated.ConvertedType { return nil } +func (t booleanType) PhysicalType() *format.Type { return &physicalTypes[Boolean] } + +func (t booleanType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newBooleanColumnIndexer() +} + +func (t booleanType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newBooleanColumnBuffer(t, makeColumnIndex(columnIndex), 
makeNumValues(numValues)) +} + +func (t booleanType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newBooleanDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t booleanType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newBooleanPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t booleanType) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.BooleanValues(values) +} + +func (t booleanType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeBoolean(dst, src, enc) +} + +func (t booleanType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeBoolean(dst, src, enc) +} + +func (t booleanType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t booleanType) AssignValue(dst reflect.Value, src Value) error { + v := src.boolean() + switch dst.Kind() { + case reflect.Bool: + dst.SetBool(v) + default: + dst.Set(reflect.ValueOf(v)) + } + return nil +} + +func (t booleanType) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *stringType: + return convertStringToBoolean(val) + } + switch typ.Kind() { + case Boolean: + return val, nil + case Int32: + return convertInt32ToBoolean(val) + case Int64: + return convertInt64ToBoolean(val) + case Int96: + return convertInt96ToBoolean(val) + case Float: + return convertFloatToBoolean(val) + case Double: + return convertDoubleToBoolean(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToBoolean(val) + default: + return makeValueKind(Boolean), nil + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_bson.go b/vendor/github.com/parquet-go/parquet-go/type_bson.go new file mode 100644 index 00000000000..fcf9117772d --- /dev/null +++ 
b/vendor/github.com/parquet-go/parquet-go/type_bson.go @@ -0,0 +1,87 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// BSON constructs a leaf node of BSON logical type. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#bson +func BSON() Node { return Leaf(&bsonType{}) } + +var bsonLogicalType = format.LogicalType{ + Bson: new(format.BsonType), +} + +type bsonType format.BsonType + +func (t *bsonType) String() string { return (*format.BsonType)(t).String() } + +func (t *bsonType) Kind() Kind { return byteArrayType{}.Kind() } + +func (t *bsonType) Length() int { return byteArrayType{}.Length() } + +func (t *bsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } + +func (t *bsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } + +func (t *bsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } + +func (t *bsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } + +func (t *bsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } + +func (t *bsonType) LogicalType() *format.LogicalType { return &bsonLogicalType } + +func (t *bsonType) ConvertedType() *deprecated.ConvertedType { + return &convertedTypes[deprecated.Bson] +} + +func (t *bsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return byteArrayType{}.NewColumnIndexer(sizeLimit) +} + +func (t *bsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return byteArrayType{}.NewDictionary(columnIndex, numValues, data) +} + +func (t *bsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) +} + +func (t *bsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return 
byteArrayType{}.NewPage(columnIndex, numValues, data) +} + +func (t *bsonType) NewValues(values []byte, offsets []uint32) encoding.Values { + return byteArrayType{}.NewValues(values, offsets) +} + +func (t *bsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return byteArrayType{}.Encode(dst, src, enc) +} + +func (t *bsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return byteArrayType{}.Decode(dst, src, enc) +} + +func (t *bsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) +} + +func (t *bsonType) AssignValue(dst reflect.Value, src Value) error { + return byteArrayType{}.AssignValue(dst, src) +} + +func (t *bsonType) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *byteArrayType, *bsonType: + return val, nil + default: + return val, invalidConversion(val, "BSON", typ.String()) + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_byte_array.go b/vendor/github.com/parquet-go/parquet-go/type_byte_array.go new file mode 100644 index 00000000000..b15492bac62 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_byte_array.go @@ -0,0 +1,90 @@ +package parquet + +import ( + "bytes" + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type byteArrayType struct{} + +func (t byteArrayType) String() string { return "BYTE_ARRAY" } +func (t byteArrayType) Kind() Kind { return ByteArray } +func (t byteArrayType) Length() int { return 0 } +func (t byteArrayType) EstimateSize(n int) int { return estimatedSizeOfByteArrayValues * n } +func (t byteArrayType) EstimateNumValues(n int) int { return n / estimatedSizeOfByteArrayValues } +func (t byteArrayType) Compare(a, b Value) int { return bytes.Compare(a.byteArray(), b.byteArray()) } 
+func (t byteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t byteArrayType) LogicalType() *format.LogicalType { return nil } +func (t byteArrayType) ConvertedType() *deprecated.ConvertedType { return nil } +func (t byteArrayType) PhysicalType() *format.Type { return &physicalTypes[ByteArray] } + +func (t byteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newByteArrayColumnIndexer(sizeLimit) +} + +func (t byteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t byteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t byteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t byteArrayType) NewValues(values []byte, offsets []uint32) encoding.Values { + return encoding.ByteArrayValues(values, offsets) +} + +func (t byteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeByteArray(dst, src, enc) +} + +func (t byteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeByteArray(dst, src, enc) +} + +func (t byteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return enc.EstimateDecodeByteArraySize(src) +} + +func (t byteArrayType) AssignValue(dst reflect.Value, src Value) error { + v := src.byteArray() + switch dst.Kind() { + case reflect.String: + dst.SetString(string(v)) + case reflect.Slice: + dst.SetBytes(copyBytes(v)) + default: + val := reflect.ValueOf(string(v)) + dst.Set(val) + } + return nil +} + +func (t byteArrayType) ConvertValue(val Value, 
typ Type) (Value, error) { + switch typ.Kind() { + case Boolean: + return convertBooleanToByteArray(val) + case Int32: + return convertInt32ToByteArray(val) + case Int64: + return convertInt64ToByteArray(val) + case Int96: + return convertInt96ToByteArray(val) + case Float: + return convertFloatToByteArray(val) + case Double: + return convertDoubleToByteArray(val) + case ByteArray, FixedLenByteArray: + return val, nil + default: + return makeValueKind(ByteArray), nil + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_date.go b/vendor/github.com/parquet-go/parquet-go/type_date.go new file mode 100644 index 00000000000..835821915af --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_date.go @@ -0,0 +1,89 @@ +package parquet + +import ( + "reflect" + "time" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// Date constructs a leaf node of DATE logical type. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date +func Date() Node { return Leaf(&dateType{}) } + +var dateLogicalType = format.LogicalType{ + Date: new(format.DateType), +} + +type dateType format.DateType + +func (t *dateType) String() string { return (*format.DateType)(t).String() } + +func (t *dateType) Kind() Kind { return int32Type{}.Kind() } + +func (t *dateType) Length() int { return int32Type{}.Length() } + +func (t *dateType) EstimateSize(n int) int { return int32Type{}.EstimateSize(n) } + +func (t *dateType) EstimateNumValues(n int) int { return int32Type{}.EstimateNumValues(n) } + +func (t *dateType) Compare(a, b Value) int { return int32Type{}.Compare(a, b) } + +func (t *dateType) ColumnOrder() *format.ColumnOrder { return int32Type{}.ColumnOrder() } + +func (t *dateType) PhysicalType() *format.Type { return int32Type{}.PhysicalType() } + +func (t *dateType) LogicalType() *format.LogicalType { return &dateLogicalType } + +func (t *dateType) 
ConvertedType() *deprecated.ConvertedType { + return &convertedTypes[deprecated.Date] +} + +func (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return int32Type{}.NewColumnIndexer(sizeLimit) +} + +func (t *dateType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return int32Type{}.NewDictionary(columnIndex, numValues, data) +} + +func (t *dateType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return int32Type{}.NewColumnBuffer(columnIndex, numValues) +} + +func (t *dateType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return int32Type{}.NewPage(columnIndex, numValues, data) +} + +func (t *dateType) NewValues(values []byte, offsets []uint32) encoding.Values { + return int32Type{}.NewValues(values, offsets) +} + +func (t *dateType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return int32Type{}.Encode(dst, src, enc) +} + +func (t *dateType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return int32Type{}.Decode(dst, src, enc) +} + +func (t *dateType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return int32Type{}.EstimateDecodeSize(numValues, src, enc) +} + +func (t *dateType) AssignValue(dst reflect.Value, src Value) error { + return int32Type{}.AssignValue(dst, src) +} + +func (t *dateType) ConvertValue(val Value, typ Type) (Value, error) { + switch src := typ.(type) { + case *stringType: + return convertStringToDate(val, time.UTC) + case *timestampType: + return convertTimestampToDate(val, src.Unit, src.tz()) + } + return int32Type{}.ConvertValue(val, typ) +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_decimal.go b/vendor/github.com/parquet-go/parquet-go/type_decimal.go new file mode 100644 index 00000000000..d519b078b15 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_decimal.go @@ -0,0 +1,40 @@ +package parquet + +import ( + 
"github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/format" +) + +// Decimal constructs a leaf node of decimal logical type with the given +// scale, precision, and underlying type. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal +func Decimal(scale, precision int, typ Type) Node { + switch typ.Kind() { + case Int32, Int64, FixedLenByteArray: + default: + panic("DECIMAL node must annotate Int32, Int64 or FixedLenByteArray but got " + typ.String()) + } + return Leaf(&decimalType{ + decimal: format.DecimalType{ + Scale: int32(scale), + Precision: int32(precision), + }, + Type: typ, + }) +} + +type decimalType struct { + decimal format.DecimalType + Type +} + +func (t *decimalType) String() string { return t.decimal.String() } + +func (t *decimalType) LogicalType() *format.LogicalType { + return &format.LogicalType{Decimal: &t.decimal} +} + +func (t *decimalType) ConvertedType() *deprecated.ConvertedType { + return &convertedTypes[deprecated.Decimal] +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_double.go b/vendor/github.com/parquet-go/parquet-go/type_double.go new file mode 100644 index 00000000000..a94ed8654f8 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_double.go @@ -0,0 +1,90 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type doubleType struct{} + +func (t doubleType) String() string { return "DOUBLE" } +func (t doubleType) Kind() Kind { return Double } +func (t doubleType) Length() int { return 64 } +func (t doubleType) EstimateSize(n int) int { return 8 * n } +func (t doubleType) EstimateNumValues(n int) int { return n / 8 } +func (t doubleType) Compare(a, b Value) int { return compareFloat64(a.double(), b.double()) } +func (t doubleType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t 
doubleType) LogicalType() *format.LogicalType { return nil } +func (t doubleType) ConvertedType() *deprecated.ConvertedType { return nil } +func (t doubleType) PhysicalType() *format.Type { return &physicalTypes[Double] } + +func (t doubleType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newDoubleColumnIndexer() +} + +func (t doubleType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newDoubleColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t doubleType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newDoubleDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t doubleType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newDoublePage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t doubleType) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.DoubleValuesFromBytes(values) +} + +func (t doubleType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeDouble(dst, src, enc) +} + +func (t doubleType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeDouble(dst, src, enc) +} + +func (t doubleType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t doubleType) AssignValue(dst reflect.Value, src Value) error { + v := src.double() + switch dst.Kind() { + case reflect.Float32, reflect.Float64: + dst.SetFloat(v) + default: + dst.Set(reflect.ValueOf(v)) + } + return nil +} + +func (t doubleType) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *stringType: + return convertStringToDouble(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToDouble(val) + case Int32: + return convertInt32ToDouble(val) + case Int64: + return 
convertInt64ToDouble(val) + case Int96: + return convertInt96ToDouble(val) + case Float: + return convertFloatToDouble(val) + case Double: + return val, nil + case ByteArray, FixedLenByteArray: + return convertByteArrayToDouble(val) + default: + return makeValueKind(Double), nil + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_enum.go b/vendor/github.com/parquet-go/parquet-go/type_enum.go new file mode 100644 index 00000000000..1688d5a0567 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_enum.go @@ -0,0 +1,87 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// Enum constructs a leaf node with a logical type representing enumerations. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#enum +func Enum() Node { return Leaf(&enumType{}) } + +var enumLogicalType = format.LogicalType{ + Enum: new(format.EnumType), +} + +type enumType format.EnumType + +func (t *enumType) String() string { return (*format.EnumType)(t).String() } + +func (t *enumType) Kind() Kind { return new(stringType).Kind() } + +func (t *enumType) Length() int { return new(stringType).Length() } + +func (t *enumType) EstimateSize(n int) int { return new(stringType).EstimateSize(n) } + +func (t *enumType) EstimateNumValues(n int) int { return new(stringType).EstimateNumValues(n) } + +func (t *enumType) Compare(a, b Value) int { return new(stringType).Compare(a, b) } + +func (t *enumType) ColumnOrder() *format.ColumnOrder { return new(stringType).ColumnOrder() } + +func (t *enumType) PhysicalType() *format.Type { return new(stringType).PhysicalType() } + +func (t *enumType) LogicalType() *format.LogicalType { return &enumLogicalType } + +func (t *enumType) ConvertedType() *deprecated.ConvertedType { + return &convertedTypes[deprecated.Enum] +} + +func (t *enumType) NewColumnIndexer(sizeLimit int) 
ColumnIndexer { + return new(stringType).NewColumnIndexer(sizeLimit) +} + +func (t *enumType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return new(stringType).NewDictionary(columnIndex, numValues, data) +} + +func (t *enumType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return new(stringType).NewColumnBuffer(columnIndex, numValues) +} + +func (t *enumType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return new(stringType).NewPage(columnIndex, numValues, data) +} + +func (t *enumType) NewValues(values []byte, offsets []uint32) encoding.Values { + return new(stringType).NewValues(values, offsets) +} + +func (t *enumType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return new(stringType).Encode(dst, src, enc) +} + +func (t *enumType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return new(stringType).Decode(dst, src, enc) +} + +func (t *enumType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return new(stringType).EstimateDecodeSize(numValues, src, enc) +} + +func (t *enumType) AssignValue(dst reflect.Value, src Value) error { + return new(stringType).AssignValue(dst, src) +} + +func (t *enumType) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *byteArrayType, *stringType, *enumType: + return val, nil + default: + return val, invalidConversion(val, "ENUM", typ.String()) + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_fixed_len_byte_array.go b/vendor/github.com/parquet-go/parquet-go/type_fixed_len_byte_array.go new file mode 100644 index 00000000000..43b58c1dd2c --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_fixed_len_byte_array.go @@ -0,0 +1,234 @@ +package parquet + +import ( + "bytes" + "fmt" + "reflect" + "unsafe" + + "github.com/google/uuid" + "github.com/parquet-go/parquet-go/deprecated" + 
"github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type fixedLenByteArrayType struct { + length int + isUUID bool +} + +func (t fixedLenByteArrayType) String() string { + return fmt.Sprintf("FIXED_LEN_BYTE_ARRAY(%d)", t.length) +} + +func (t fixedLenByteArrayType) Kind() Kind { return FixedLenByteArray } + +func (t fixedLenByteArrayType) Length() int { return t.length } + +func (t fixedLenByteArrayType) EstimateSize(n int) int { return t.length * n } + +func (t fixedLenByteArrayType) EstimateNumValues(n int) int { return n / t.length } + +func (t fixedLenByteArrayType) Compare(a, b Value) int { + return bytes.Compare(a.byteArray(), b.byteArray()) +} + +func (t fixedLenByteArrayType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } + +func (t fixedLenByteArrayType) LogicalType() *format.LogicalType { return nil } + +func (t fixedLenByteArrayType) ConvertedType() *deprecated.ConvertedType { return nil } + +func (t fixedLenByteArrayType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } + +func (t fixedLenByteArrayType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newFixedLenByteArrayColumnIndexer(t.length, sizeLimit) +} + +func (t fixedLenByteArrayType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newFixedLenByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t fixedLenByteArrayType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newFixedLenByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t fixedLenByteArrayType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newFixedLenByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t fixedLenByteArrayType) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.FixedLenByteArrayValues(values, t.length) 
+} + +func (t fixedLenByteArrayType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeFixedLenByteArray(dst, src, enc) +} + +func (t fixedLenByteArrayType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeFixedLenByteArray(dst, src, enc) +} + +func (t fixedLenByteArrayType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t fixedLenByteArrayType) AssignValue(dst reflect.Value, src Value) error { + v := src.byteArray() + switch dst.Kind() { + case reflect.Array: + if dst.Type().Elem().Kind() == reflect.Uint8 && dst.Len() == len(v) { + // This code could be implemented as a call to reflect.Copy but + // it would require creating a reflect.Value from v which causes + // the heap allocation to pack the []byte value. To avoid this + // overhead we instead convert the reflect.Value holding the + // destination array into a byte slice which allows us to use + // a more efficient call to copy. + d := unsafe.Slice((*byte)(reflectValueData(dst)), len(v)) + copy(d, v) + return nil + } + case reflect.Slice: + dst.SetBytes(copyBytes(v)) + return nil + case reflect.String: + if t.isUUID { + dst.SetString(uuid.UUID(v).String()) + return nil + } + } + + val := reflect.ValueOf(copyBytes(v)) + dst.Set(val) + return nil +} + +func reflectValueData(v reflect.Value) unsafe.Pointer { + return (*[2]unsafe.Pointer)(unsafe.Pointer(&v))[1] +} + +func reflectValuePointer(v reflect.Value) unsafe.Pointer { + if v.Kind() == reflect.Map { + // Map values are inlined in the reflect.Value data area, + // because they are a reference type and their paointer is + // packed in the interface. However, we need to get an + // address to the pointer itself, so we extract it and + // return the address of this pointer. 
It causes a heap + // allocation, which is unfortunate, an we would probably + // want to optimize away eventually. + p := v.UnsafePointer() + return unsafe.Pointer(&p) + } + return reflectValueData(v) +} + +func (t fixedLenByteArrayType) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *stringType: + return convertStringToFixedLenByteArray(val, t.length) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToFixedLenByteArray(val, t.length) + case Int32: + return convertInt32ToFixedLenByteArray(val, t.length) + case Int64: + return convertInt64ToFixedLenByteArray(val, t.length) + case Int96: + return convertInt96ToFixedLenByteArray(val, t.length) + case Float: + return convertFloatToFixedLenByteArray(val, t.length) + case Double: + return convertDoubleToFixedLenByteArray(val, t.length) + case ByteArray, FixedLenByteArray: + return convertByteArrayToFixedLenByteArray(val, t.length) + default: + return makeValueBytes(FixedLenByteArray, make([]byte, t.length)), nil + } +} + +// BE128 stands for "big-endian 128 bits". This type is used as a special case +// for fixed-length byte arrays of 16 bytes, which are commonly used to +// represent columns of random unique identifiers such as UUIDs. +// +// Comparisons of BE128 values use the natural byte order, the zeroth byte is +// the most significant byte. +// +// The special case is intended to provide optimizations based on the knowledge +// that the values are 16 bytes long. Stronger type checking can also be applied +// by the compiler when using [16]byte values rather than []byte, reducing the +// risk of errors on these common code paths. 
+type be128Type struct { + isUUID bool +} + +func (t be128Type) String() string { return "FIXED_LEN_BYTE_ARRAY(16)" } + +func (t be128Type) Kind() Kind { return FixedLenByteArray } + +func (t be128Type) Length() int { return 16 } + +func (t be128Type) EstimateSize(n int) int { return 16 * n } + +func (t be128Type) EstimateNumValues(n int) int { return n / 16 } + +func (t be128Type) Compare(a, b Value) int { return compareBE128(a.be128(), b.be128()) } + +func (t be128Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } + +func (t be128Type) LogicalType() *format.LogicalType { return nil } + +func (t be128Type) ConvertedType() *deprecated.ConvertedType { return nil } + +func (t be128Type) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } + +func (t be128Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newBE128ColumnIndexer() +} + +func (t be128Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newBE128ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t be128Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newBE128Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t be128Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newBE128Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t be128Type) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.FixedLenByteArrayValues(values, 16) +} + +func (t be128Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeFixedLenByteArray(dst, src, enc) +} + +func (t be128Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeFixedLenByteArray(dst, src, enc) +} + +func (t be128Type) EstimateDecodeSize(numValues int, src []byte, enc 
encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t be128Type) AssignValue(dst reflect.Value, src Value) error { + return fixedLenByteArrayType{length: 16, isUUID: t.isUUID}.AssignValue(dst, src) +} + +func (t be128Type) ConvertValue(val Value, typ Type) (Value, error) { + return fixedLenByteArrayType{length: 16, isUUID: t.isUUID}.ConvertValue(val, typ) +} + +// FixedLenByteArrayType constructs a type for fixed-length values of the given +// size (in bytes). +func FixedLenByteArrayType(length int) Type { + switch length { + case 16: + return be128Type{} + default: + return fixedLenByteArrayType{length: length} + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_float.go b/vendor/github.com/parquet-go/parquet-go/type_float.go new file mode 100644 index 00000000000..10420562127 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_float.go @@ -0,0 +1,90 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type floatType struct{} + +func (t floatType) String() string { return "FLOAT" } +func (t floatType) Kind() Kind { return Float } +func (t floatType) Length() int { return 32 } +func (t floatType) EstimateSize(n int) int { return 4 * n } +func (t floatType) EstimateNumValues(n int) int { return n / 4 } +func (t floatType) Compare(a, b Value) int { return compareFloat32(a.float(), b.float()) } +func (t floatType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t floatType) LogicalType() *format.LogicalType { return nil } +func (t floatType) ConvertedType() *deprecated.ConvertedType { return nil } +func (t floatType) PhysicalType() *format.Type { return &physicalTypes[Float] } + +func (t floatType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newFloatColumnIndexer() +} + +func (t floatType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { 
+ return newFloatColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t floatType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newFloatDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t floatType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newFloatPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t floatType) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.FloatValuesFromBytes(values) +} + +func (t floatType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeFloat(dst, src, enc) +} + +func (t floatType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeFloat(dst, src, enc) +} + +func (t floatType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t floatType) AssignValue(dst reflect.Value, src Value) error { + v := src.float() + switch dst.Kind() { + case reflect.Float32, reflect.Float64: + dst.SetFloat(float64(v)) + default: + dst.Set(reflect.ValueOf(v)) + } + return nil +} + +func (t floatType) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *stringType: + return convertStringToFloat(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToFloat(val) + case Int32: + return convertInt32ToFloat(val) + case Int64: + return convertInt64ToFloat(val) + case Int96: + return convertInt96ToFloat(val) + case Float: + return val, nil + case Double: + return convertDoubleToFloat(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToFloat(val) + default: + return makeValueKind(Float), nil + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_group.go b/vendor/github.com/parquet-go/parquet-go/type_group.go new file mode 
100644 index 00000000000..df9610a9a50 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_group.go @@ -0,0 +1,75 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type groupType struct{} + +func (groupType) String() string { return "group" } + +func (groupType) Kind() Kind { + panic("cannot call Kind on parquet group") +} + +func (groupType) Compare(Value, Value) int { + panic("cannot compare values on parquet group") +} + +func (groupType) NewColumnIndexer(int) ColumnIndexer { + panic("cannot create column indexer from parquet group") +} + +func (groupType) NewDictionary(int, int, encoding.Values) Dictionary { + panic("cannot create dictionary from parquet group") +} + +func (t groupType) NewColumnBuffer(int, int) ColumnBuffer { + panic("cannot create column buffer from parquet group") +} + +func (t groupType) NewPage(int, int, encoding.Values) Page { + panic("cannot create page from parquet group") +} + +func (t groupType) NewValues(_ []byte, _ []uint32) encoding.Values { + panic("cannot create values from parquet group") +} + +func (groupType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { + panic("cannot encode parquet group") +} + +func (groupType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { + panic("cannot decode parquet group") +} + +func (groupType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { + panic("cannot estimate decode size of parquet group") +} + +func (groupType) AssignValue(reflect.Value, Value) error { + panic("cannot assign value to a parquet group") +} + +func (t groupType) ConvertValue(Value, Type) (Value, error) { + panic("cannot convert value to a parquet group") +} + +func (groupType) Length() int { return 0 } + +func (groupType) EstimateSize(int) int { return 0 } + +func (groupType) EstimateNumValues(int) 
int { return 0 } + +func (groupType) ColumnOrder() *format.ColumnOrder { return nil } + +func (groupType) PhysicalType() *format.Type { return nil } + +func (groupType) LogicalType() *format.LogicalType { return nil } + +func (groupType) ConvertedType() *deprecated.ConvertedType { return nil } diff --git a/vendor/github.com/parquet-go/parquet-go/type_int32.go b/vendor/github.com/parquet-go/parquet-go/type_int32.go new file mode 100644 index 00000000000..f6497c76d0d --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_int32.go @@ -0,0 +1,119 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type int32Type struct{} + +func (t int32Type) String() string { return "INT32" } +func (t int32Type) Kind() Kind { return Int32 } +func (t int32Type) Length() int { return 32 } +func (t int32Type) EstimateSize(n int) int { return 4 * n } +func (t int32Type) EstimateNumValues(n int) int { return n / 4 } +func (t int32Type) Compare(a, b Value) int { return compareInt32(a.int32(), b.int32()) } +func (t int32Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t int32Type) LogicalType() *format.LogicalType { + return &format.LogicalType{Integer: &format.IntType{ + BitWidth: 32, + IsSigned: true, + }} +} +func (t int32Type) ConvertedType() *deprecated.ConvertedType { return nil } +func (t int32Type) PhysicalType() *format.Type { return &physicalTypes[Int32] } + +func (t int32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newInt32ColumnIndexer() +} + +func (t int32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t int32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newInt32Dictionary(t, makeColumnIndex(columnIndex), 
makeNumValues(numValues), data) +} + +func (t int32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t int32Type) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.Int32ValuesFromBytes(values) +} + +func (t int32Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeInt32(dst, src, enc) +} + +func (t int32Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeInt32(dst, src, enc) +} + +func (t int32Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t int32Type) AssignValue(dst reflect.Value, src Value) error { + v := src.int32() + switch dst.Kind() { + case reflect.Int8, reflect.Int16, reflect.Int32: + dst.SetInt(int64(v)) + case reflect.Uint8, reflect.Uint16, reflect.Uint32: + dst.SetUint(uint64(v)) + default: + dst.Set(reflect.ValueOf(v)) + } + return nil +} + +func (t int32Type) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *stringType: + return convertStringToInt32(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToInt32(val) + case Int32: + return val, nil + case Int64: + return convertInt64ToInt32(val) + case Int96: + return convertInt96ToInt32(val) + case Float: + return convertFloatToInt32(val) + case Double: + return convertDoubleToInt32(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToInt32(val) + default: + return makeValueKind(Int32), nil + } +} + +type uint32Type struct{ int32Type } + +func (t uint32Type) Compare(a, b Value) int { + return compareUint32(a.uint32(), b.uint32()) +} + +func (t uint32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newUint32ColumnIndexer() +} + +func (t uint32Type) NewColumnBuffer(columnIndex, 
numValues int) ColumnBuffer { + return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t uint32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t uint32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newUint32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_int64.go b/vendor/github.com/parquet-go/parquet-go/type_int64.go new file mode 100644 index 00000000000..9742a7f14b0 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_int64.go @@ -0,0 +1,119 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type int64Type struct{} + +func (t int64Type) String() string { return "INT64" } +func (t int64Type) Kind() Kind { return Int64 } +func (t int64Type) Length() int { return 64 } +func (t int64Type) EstimateSize(n int) int { return 8 * n } +func (t int64Type) EstimateNumValues(n int) int { return n / 8 } +func (t int64Type) Compare(a, b Value) int { return compareInt64(a.int64(), b.int64()) } +func (t int64Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t int64Type) LogicalType() *format.LogicalType { + return &format.LogicalType{Integer: &format.IntType{ + BitWidth: 64, + IsSigned: true, + }} +} +func (t int64Type) ConvertedType() *deprecated.ConvertedType { return nil } +func (t int64Type) PhysicalType() *format.Type { return &physicalTypes[Int64] } + +func (t int64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newInt64ColumnIndexer() +} + +func (t int64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), 
makeNumValues(numValues)) +} + +func (t int64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t int64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t int64Type) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.Int64ValuesFromBytes(values) +} + +func (t int64Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeInt64(dst, src, enc) +} + +func (t int64Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeInt64(dst, src, enc) +} + +func (t int64Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t int64Type) AssignValue(dst reflect.Value, src Value) error { + v := src.int64() + switch dst.Kind() { + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int: + dst.SetInt(v) + case reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uint, reflect.Uintptr: + dst.SetUint(uint64(v)) + default: + dst.Set(reflect.ValueOf(v)) + } + return nil +} + +func (t int64Type) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *stringType: + return convertStringToInt64(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToInt64(val) + case Int32: + return convertInt32ToInt64(val) + case Int64: + return val, nil + case Int96: + return convertInt96ToInt64(val) + case Float: + return convertFloatToInt64(val) + case Double: + return convertDoubleToInt64(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToInt64(val) + default: + return makeValueKind(Int64), nil + } +} + +type uint64Type struct{ int64Type } + +func 
(t uint64Type) Compare(a, b Value) int { + return compareUint64(a.uint64(), b.uint64()) +} + +func (t uint64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newUint64ColumnIndexer() +} + +func (t uint64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t uint64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t uint64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newUint64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_int96.go b/vendor/github.com/parquet-go/parquet-go/type_int96.go new file mode 100644 index 00000000000..9abb6dfd1eb --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_int96.go @@ -0,0 +1,86 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type int96Type struct{} + +func (t int96Type) String() string { return "INT96" } + +func (t int96Type) Kind() Kind { return Int96 } +func (t int96Type) Length() int { return 96 } +func (t int96Type) EstimateSize(n int) int { return 12 * n } +func (t int96Type) EstimateNumValues(n int) int { return n / 12 } +func (t int96Type) Compare(a, b Value) int { return compareInt96(a.int96(), b.int96()) } +func (t int96Type) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t int96Type) LogicalType() *format.LogicalType { return nil } +func (t int96Type) ConvertedType() *deprecated.ConvertedType { return nil } +func (t int96Type) PhysicalType() *format.Type { return &physicalTypes[Int96] } + +func (t int96Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return 
newInt96ColumnIndexer() +} + +func (t int96Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newInt96ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t int96Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newInt96Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t int96Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newInt96Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t int96Type) NewValues(values []byte, _ []uint32) encoding.Values { + return encoding.Int96ValuesFromBytes(values) +} + +func (t int96Type) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeInt96(dst, src, enc) +} + +func (t int96Type) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeInt96(dst, src, enc) +} + +func (t int96Type) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.EstimateSize(numValues) +} + +func (t int96Type) AssignValue(dst reflect.Value, src Value) error { + v := src.Int96() + dst.Set(reflect.ValueOf(v)) + return nil +} + +func (t int96Type) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *stringType: + return convertStringToInt96(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToInt96(val) + case Int32: + return convertInt32ToInt96(val) + case Int64: + return convertInt64ToInt96(val) + case Int96: + return val, nil + case Float: + return convertFloatToInt96(val) + case Double: + return convertDoubleToInt96(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToInt96(val) + default: + return makeValueKind(Int96), nil + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_int_logical.go b/vendor/github.com/parquet-go/parquet-go/type_int_logical.go 
new file mode 100644 index 00000000000..16c258f4ee1 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_int_logical.go @@ -0,0 +1,206 @@ +package parquet + +import ( + "fmt" + "math/bits" + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// Int constructs a leaf node of signed integer logical type of the given bit +// width. +// +// The bit width must be one of 8, 16, 32, 64, or the function will panic. +func Int(bitWidth int) Node { + return Leaf(integerType(bitWidth, &signedIntTypes)) +} + +// Uint constructs a leaf node of unsigned integer logical type of the given +// bit width. +// +// The bit width must be one of 8, 16, 32, 64, or the function will panic. +func Uint(bitWidth int) Node { + return Leaf(integerType(bitWidth, &unsignedIntTypes)) +} + +func integerType(bitWidth int, types *[4]intType) *intType { + switch bitWidth { + case 8: + return &types[0] + case 16: + return &types[1] + case 32: + return &types[2] + case 64: + return &types[3] + default: + panic(fmt.Sprintf("cannot create a %d bits parquet integer node", bitWidth)) + } +} + +var signedIntTypes = [...]intType{ + {BitWidth: 8, IsSigned: true}, + {BitWidth: 16, IsSigned: true}, + {BitWidth: 32, IsSigned: true}, + {BitWidth: 64, IsSigned: true}, +} + +var unsignedIntTypes = [...]intType{ + {BitWidth: 8, IsSigned: false}, + {BitWidth: 16, IsSigned: false}, + {BitWidth: 32, IsSigned: false}, + {BitWidth: 64, IsSigned: false}, +} + +var signedLogicalIntTypes = [...]format.LogicalType{ + {Integer: (*format.IntType)(&signedIntTypes[0])}, + {Integer: (*format.IntType)(&signedIntTypes[1])}, + {Integer: (*format.IntType)(&signedIntTypes[2])}, + {Integer: (*format.IntType)(&signedIntTypes[3])}, +} + +var unsignedLogicalIntTypes = [...]format.LogicalType{ + {Integer: (*format.IntType)(&unsignedIntTypes[0])}, + {Integer: (*format.IntType)(&unsignedIntTypes[1])}, + {Integer: 
(*format.IntType)(&unsignedIntTypes[2])}, + {Integer: (*format.IntType)(&unsignedIntTypes[3])}, +} + +type intType format.IntType + +func (t *intType) baseType() Type { + if t.IsSigned { + if t.BitWidth == 64 { + return int64Type{} + } else { + return int32Type{} + } + } else { + if t.BitWidth == 64 { + return uint64Type{} + } else { + return uint32Type{} + } + } +} + +func (t *intType) String() string { return (*format.IntType)(t).String() } + +func (t *intType) Kind() Kind { return t.baseType().Kind() } + +func (t *intType) Length() int { return int(t.BitWidth) } + +func (t *intType) EstimateSize(n int) int { return (int(t.BitWidth) / 8) * n } + +func (t *intType) EstimateNumValues(n int) int { return n / (int(t.BitWidth) / 8) } + +func (t *intType) Compare(a, b Value) int { + // This code is similar to t.baseType().Compare(a,b) but comparison methods + // tend to be invoked a lot (e.g. when sorting) so avoiding the interface + // indirection in this case yields much better throughput in some cases. 
+ if t.BitWidth == 64 { + i1 := a.int64() + i2 := b.int64() + if t.IsSigned { + return compareInt64(i1, i2) + } else { + return compareUint64(uint64(i1), uint64(i2)) + } + } else { + i1 := a.int32() + i2 := b.int32() + if t.IsSigned { + return compareInt32(i1, i2) + } else { + return compareUint32(uint32(i1), uint32(i2)) + } + } +} + +func (t *intType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } + +func (t *intType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } + +func (t *intType) LogicalType() *format.LogicalType { + switch t { + case &signedIntTypes[0]: + return &signedLogicalIntTypes[0] + case &signedIntTypes[1]: + return &signedLogicalIntTypes[1] + case &signedIntTypes[2]: + return &signedLogicalIntTypes[2] + case &signedIntTypes[3]: + return &signedLogicalIntTypes[3] + case &unsignedIntTypes[0]: + return &unsignedLogicalIntTypes[0] + case &unsignedIntTypes[1]: + return &unsignedLogicalIntTypes[1] + case &unsignedIntTypes[2]: + return &unsignedLogicalIntTypes[2] + case &unsignedIntTypes[3]: + return &unsignedLogicalIntTypes[3] + default: + return &format.LogicalType{Integer: (*format.IntType)(t)} + } +} + +func (t *intType) ConvertedType() *deprecated.ConvertedType { + convertedType := bits.Len8(uint8(t.BitWidth)/8) - 1 // 8=>0, 16=>1, 32=>2, 64=>4 + if t.IsSigned { + convertedType += int(deprecated.Int8) + } else { + convertedType += int(deprecated.Uint8) + } + return &convertedTypes[convertedType] +} + +func (t *intType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return t.baseType().NewColumnIndexer(sizeLimit) +} + +func (t *intType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return t.baseType().NewColumnBuffer(columnIndex, numValues) +} + +func (t *intType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return t.baseType().NewDictionary(columnIndex, numValues, data) +} + +func (t *intType) NewPage(columnIndex, numValues int, data encoding.Values) Page { 
+ return t.baseType().NewPage(columnIndex, numValues, data) +} + +func (t *intType) NewValues(values []byte, offsets []uint32) encoding.Values { + return t.baseType().NewValues(values, offsets) +} + +func (t *intType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return t.baseType().Encode(dst, src, enc) +} + +func (t *intType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return t.baseType().Decode(dst, src, enc) +} + +func (t *intType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.baseType().EstimateDecodeSize(numValues, src, enc) +} + +func (t *intType) AssignValue(dst reflect.Value, src Value) error { + if t.BitWidth == 64 { + return int64Type{}.AssignValue(dst, src) + } else { + return int32Type{}.AssignValue(dst, src) + } +} + +func (t *intType) ConvertValue(val Value, typ Type) (Value, error) { + if t.BitWidth == 64 { + return int64Type{}.ConvertValue(val, typ) + } else { + return int32Type{}.ConvertValue(val, typ) + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_json.go b/vendor/github.com/parquet-go/parquet-go/type_json.go new file mode 100644 index 00000000000..724866c85ac --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_json.go @@ -0,0 +1,106 @@ +package parquet + +import ( + "encoding/json" + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// JSON constructs a leaf node of JSON logical type. 
+// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#json +func JSON() Node { return Leaf(&jsonType{}) } + +var jsonLogicalType = format.LogicalType{ + Json: new(format.JsonType), +} + +type jsonType format.JsonType + +func (t *jsonType) String() string { return (*format.JsonType)(t).String() } + +func (t *jsonType) Kind() Kind { return byteArrayType{}.Kind() } + +func (t *jsonType) Length() int { return byteArrayType{}.Length() } + +func (t *jsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } + +func (t *jsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } + +func (t *jsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } + +func (t *jsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } + +func (t *jsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } + +func (t *jsonType) LogicalType() *format.LogicalType { return &jsonLogicalType } + +func (t *jsonType) ConvertedType() *deprecated.ConvertedType { + return &convertedTypes[deprecated.Json] +} + +func (t *jsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return byteArrayType{}.NewColumnIndexer(sizeLimit) +} + +func (t *jsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return byteArrayType{}.NewDictionary(columnIndex, numValues, data) +} + +func (t *jsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) +} + +func (t *jsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return byteArrayType{}.NewPage(columnIndex, numValues, data) +} + +func (t *jsonType) NewValues(values []byte, offsets []uint32) encoding.Values { + return byteArrayType{}.NewValues(values, offsets) +} + +func (t *jsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return byteArrayType{}.Encode(dst, src, 
enc) +} + +func (t *jsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return byteArrayType{}.Decode(dst, src, enc) +} + +func (t *jsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) +} + +func (t *jsonType) AssignValue(dst reflect.Value, src Value) error { + // Assign value using ByteArrayType for BC... + switch dst.Kind() { + case reflect.String: + return byteArrayType{}.AssignValue(dst, src) + case reflect.Slice: + if dst.Type().Elem().Kind() == reflect.Uint8 { + return byteArrayType{}.AssignValue(dst, src) + } + } + + // Otherwise handle with json.Unmarshal + b := src.byteArray() + val := reflect.New(dst.Type()).Elem() + err := json.Unmarshal(b, val.Addr().Interface()) + if err != nil { + return err + } + dst.Set(val) + return nil +} + +func (t *jsonType) ConvertValue(val Value, typ Type) (Value, error) { + switch typ.(type) { + case *byteArrayType, *stringType, *jsonType: + return val, nil + default: + return val, invalidConversion(val, "JSON", typ.String()) + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_list.go b/vendor/github.com/parquet-go/parquet-go/type_list.go new file mode 100644 index 00000000000..20a1bbf8663 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_list.go @@ -0,0 +1,86 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// List constructs a node of LIST logical type. 
+// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists +func List(of Node) Node { + return listNode{Group{"list": Repeated(Group{"element": of})}} +} + +type listNode struct{ Group } + +func (listNode) Type() Type { return &listType{} } + +type listType format.ListType + +func (t *listType) String() string { return (*format.ListType)(t).String() } + +func (t *listType) Kind() Kind { panic("cannot call Kind on parquet LIST type") } + +func (t *listType) Length() int { return 0 } + +func (t *listType) EstimateSize(int) int { return 0 } + +func (t *listType) EstimateNumValues(int) int { return 0 } + +func (t *listType) Compare(Value, Value) int { panic("cannot compare values on parquet LIST type") } + +func (t *listType) ColumnOrder() *format.ColumnOrder { return nil } + +func (t *listType) PhysicalType() *format.Type { return nil } + +func (t *listType) LogicalType() *format.LogicalType { + return &format.LogicalType{List: (*format.ListType)(t)} +} + +func (t *listType) ConvertedType() *deprecated.ConvertedType { + return &convertedTypes[deprecated.List] +} + +func (t *listType) NewColumnIndexer(int) ColumnIndexer { + panic("create create column indexer from parquet LIST type") +} + +func (t *listType) NewDictionary(int, int, encoding.Values) Dictionary { + panic("cannot create dictionary from parquet LIST type") +} + +func (t *listType) NewColumnBuffer(int, int) ColumnBuffer { + panic("cannot create column buffer from parquet LIST type") +} + +func (t *listType) NewPage(int, int, encoding.Values) Page { + panic("cannot create page from parquet LIST type") +} + +func (t *listType) NewValues(values []byte, _ []uint32) encoding.Values { + panic("cannot create values from parquet LIST type") +} + +func (t *listType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { + panic("cannot encode parquet LIST type") +} + +func (t *listType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { + 
panic("cannot decode parquet LIST type") +} + +func (t *listType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { + panic("cannot estimate decode size of parquet LIST type") +} + +func (t *listType) AssignValue(reflect.Value, Value) error { + panic("cannot assign value to a parquet LIST type") +} + +func (t *listType) ConvertValue(Value, Type) (Value, error) { + panic("cannot convert value to a parquet LIST type") +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_map.go b/vendor/github.com/parquet-go/parquet-go/type_map.go new file mode 100644 index 00000000000..d5f6a95cc55 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_map.go @@ -0,0 +1,91 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// Map constructs a node of MAP logical type. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps +func Map(key, value Node) Node { + return mapNode{Group{ + "key_value": Repeated(Group{ + "key": Required(key), + "value": value, + }), + }} +} + +type mapNode struct{ Group } + +func (mapNode) Type() Type { return &mapType{} } + +type mapType format.MapType + +func (t *mapType) String() string { return (*format.MapType)(t).String() } + +func (t *mapType) Kind() Kind { panic("cannot call Kind on parquet MAP type") } + +func (t *mapType) Length() int { return 0 } + +func (t *mapType) EstimateSize(int) int { return 0 } + +func (t *mapType) EstimateNumValues(int) int { return 0 } + +func (t *mapType) Compare(Value, Value) int { panic("cannot compare values on parquet MAP type") } + +func (t *mapType) ColumnOrder() *format.ColumnOrder { return nil } + +func (t *mapType) PhysicalType() *format.Type { return nil } + +func (t *mapType) LogicalType() *format.LogicalType { + return &format.LogicalType{Map: (*format.MapType)(t)} +} + +func (t *mapType) ConvertedType() 
*deprecated.ConvertedType { + return &convertedTypes[deprecated.Map] +} + +func (t *mapType) NewColumnIndexer(int) ColumnIndexer { + panic("create create column indexer from parquet MAP type") +} + +func (t *mapType) NewDictionary(int, int, encoding.Values) Dictionary { + panic("cannot create dictionary from parquet MAP type") +} + +func (t *mapType) NewColumnBuffer(int, int) ColumnBuffer { + panic("cannot create column buffer from parquet MAP type") +} + +func (t *mapType) NewPage(int, int, encoding.Values) Page { + panic("cannot create page from parquet MAP type") +} + +func (t *mapType) NewValues(values []byte, _ []uint32) encoding.Values { + panic("cannot create values from parquet MAP type") +} + +func (t *mapType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { + panic("cannot encode parquet MAP type") +} + +func (t *mapType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { + panic("cannot decode parquet MAP type") +} + +func (t *mapType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { + panic("cannot estimate decode size of parquet MAP type") +} + +func (t *mapType) AssignValue(reflect.Value, Value) error { + panic("cannot assign value to a parquet MAP type") +} + +func (t *mapType) ConvertValue(Value, Type) (Value, error) { + panic("cannot convert value to a parquet MAP type") +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_null.go b/vendor/github.com/parquet-go/parquet-go/type_null.go new file mode 100644 index 00000000000..ca1947d9cea --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_null.go @@ -0,0 +1,73 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +type nullType format.NullType + +func (t *nullType) String() string { return (*format.NullType)(t).String() } + +func (t *nullType) Kind() Kind { return -1 } + 
+func (t *nullType) Length() int { return 0 } + +func (t *nullType) EstimateSize(int) int { return 0 } + +func (t *nullType) EstimateNumValues(int) int { return 0 } + +func (t *nullType) Compare(Value, Value) int { panic("cannot compare values on parquet NULL type") } + +func (t *nullType) ColumnOrder() *format.ColumnOrder { return nil } + +func (t *nullType) PhysicalType() *format.Type { return nil } + +func (t *nullType) LogicalType() *format.LogicalType { + return &format.LogicalType{Unknown: (*format.NullType)(t)} +} + +func (t *nullType) ConvertedType() *deprecated.ConvertedType { return nil } + +func (t *nullType) NewColumnIndexer(int) ColumnIndexer { + panic("create create column indexer from parquet NULL type") +} + +func (t *nullType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newNullDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t *nullType) NewColumnBuffer(int, int) ColumnBuffer { + panic("cannot create column buffer from parquet NULL type") +} + +func (t *nullType) NewPage(columnIndex, numValues int, _ encoding.Values) Page { + return newNullPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t *nullType) NewValues(_ []byte, _ []uint32) encoding.Values { + return encoding.Values{} +} + +func (t *nullType) Encode(dst []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { + return dst[:0], nil +} + +func (t *nullType) Decode(dst encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { + return dst, nil +} + +func (t *nullType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { + return 0 +} + +func (t *nullType) AssignValue(reflect.Value, Value) error { + return nil +} + +func (t *nullType) ConvertValue(val Value, _ Type) (Value, error) { + return val, nil +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_string.go b/vendor/github.com/parquet-go/parquet-go/type_string.go new file mode 100644 index 
00000000000..cdd9afb7873 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_string.go @@ -0,0 +1,121 @@ +package parquet + +import ( + "bytes" + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// String constructs a leaf node of UTF8 logical type. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#string +func String() Node { return Leaf(&stringType{}) } + +var stringLogicalType = format.LogicalType{ + UTF8: new(format.StringType), +} + +type stringType format.StringType + +func (t *stringType) String() string { return (*format.StringType)(t).String() } + +func (t *stringType) Kind() Kind { return ByteArray } + +func (t *stringType) Length() int { return 0 } + +func (t *stringType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } + +func (t *stringType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } + +func (t *stringType) Compare(a, b Value) int { + return bytes.Compare(a.byteArray(), b.byteArray()) +} + +func (t *stringType) ColumnOrder() *format.ColumnOrder { + return &typeDefinedColumnOrder +} + +func (t *stringType) PhysicalType() *format.Type { + return &physicalTypes[ByteArray] +} + +func (t *stringType) LogicalType() *format.LogicalType { + return &stringLogicalType +} + +func (t *stringType) ConvertedType() *deprecated.ConvertedType { + return &convertedTypes[deprecated.UTF8] +} + +func (t *stringType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newByteArrayColumnIndexer(sizeLimit) +} + +func (t *stringType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t *stringType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), 
makeNumValues(numValues)) +} + +func (t *stringType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t *stringType) NewValues(values []byte, offsets []uint32) encoding.Values { + return encoding.ByteArrayValues(values, offsets) +} + +func (t *stringType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return encoding.EncodeByteArray(dst, src, enc) +} + +func (t *stringType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return encoding.DecodeByteArray(dst, src, enc) +} + +func (t *stringType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) +} + +func (t *stringType) AssignValue(dst reflect.Value, src Value) error { + return byteArrayType{}.AssignValue(dst, src) +} + +func (t *stringType) ConvertValue(val Value, typ Type) (Value, error) { + switch t2 := typ.(type) { + case *dateType: + return convertDateToString(val) + case *timeType: + tz := t2.tz() + if t2.Unit.Micros != nil { + return convertTimeMicrosToString(val, tz) + } else { + return convertTimeMillisToString(val, tz) + } + } + switch typ.Kind() { + case Boolean: + return convertBooleanToString(val) + case Int32: + return convertInt32ToString(val) + case Int64: + return convertInt64ToString(val) + case Int96: + return convertInt96ToString(val) + case Float: + return convertFloatToString(val) + case Double: + return convertDoubleToString(val) + case ByteArray: + return val, nil + case FixedLenByteArray: + return convertFixedLenByteArrayToString(val) + default: + return makeValueKind(ByteArray), nil + } +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_time.go b/vendor/github.com/parquet-go/parquet-go/type_time.go new file mode 100644 index 00000000000..e3ebe572ec1 --- /dev/null +++ 
b/vendor/github.com/parquet-go/parquet-go/type_time.go @@ -0,0 +1,262 @@ +package parquet
+
+import (
+	"reflect"
+	"time"
+
+	"github.com/parquet-go/parquet-go/deprecated"
+	"github.com/parquet-go/parquet-go/encoding"
+	"github.com/parquet-go/parquet-go/format"
+)
+
+// TimeUnit represents units of time in the parquet type system.
+type TimeUnit interface {
+	// Returns the precision of the time unit as a time.Duration value.
+	Duration() time.Duration
+	// Converts the TimeUnit value to its representation in the parquet thrift
+	// format.
+	TimeUnit() format.TimeUnit
+}
+
+var (
+	Millisecond TimeUnit = &millisecond{}
+	Microsecond TimeUnit = &microsecond{}
+	Nanosecond TimeUnit = &nanosecond{}
+)
+
+type millisecond format.MilliSeconds
+
+func (u *millisecond) Duration() time.Duration { return time.Millisecond }
+func (u *millisecond) TimeUnit() format.TimeUnit {
+	return format.TimeUnit{Millis: (*format.MilliSeconds)(u)}
+}
+
+type microsecond format.MicroSeconds
+
+func (u *microsecond) Duration() time.Duration { return time.Microsecond }
+func (u *microsecond) TimeUnit() format.TimeUnit {
+	return format.TimeUnit{Micros: (*format.MicroSeconds)(u)}
+}
+
+type nanosecond format.NanoSeconds
+
+func (u *nanosecond) Duration() time.Duration { return time.Nanosecond }
+func (u *nanosecond) TimeUnit() format.TimeUnit {
+	return format.TimeUnit{Nanos: (*format.NanoSeconds)(u)}
+}
+
+// Time constructs a leaf node of TIME logical type.
+// IsAdjustedToUTC is true by default.
+//
+// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time
+func Time(unit TimeUnit) Node {
+	return TimeAdjusted(unit, true)
+}
+
+// TimeAdjusted constructs a leaf node of TIME logical type
+// with the IsAdjustedToUTC property explicitly set.
+// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#time +func TimeAdjusted(unit TimeUnit, isAdjustedToUTC bool) Node { + // Use pre-allocated instances for common cases + timeUnit := unit.TimeUnit() + if isAdjustedToUTC { + switch { + case timeUnit.Millis != nil: + return Leaf(&timeMilliAdjustedToUTC) + case timeUnit.Micros != nil: + return Leaf(&timeMicroAdjustedToUTC) + case timeUnit.Nanos != nil: + return Leaf(&timeNanoAdjustedToUTC) + } + } else { + switch { + case timeUnit.Millis != nil: + return Leaf(&timeMilliNotAdjustedToUTC) + case timeUnit.Micros != nil: + return Leaf(&timeMicroNotAdjustedToUTC) + case timeUnit.Nanos != nil: + return Leaf(&timeNanoNotAdjustedToUTC) + } + } + // Fallback for unknown unit types + return Leaf(&timeType{IsAdjustedToUTC: isAdjustedToUTC, Unit: timeUnit}) +} + +var timeMilliAdjustedToUTC = timeType{ + IsAdjustedToUTC: true, + Unit: format.TimeUnit{Millis: new(format.MilliSeconds)}, +} + +var timeMicroAdjustedToUTC = timeType{ + IsAdjustedToUTC: true, + Unit: format.TimeUnit{Micros: new(format.MicroSeconds)}, +} + +var timeNanoAdjustedToUTC = timeType{ + IsAdjustedToUTC: true, + Unit: format.TimeUnit{Nanos: new(format.NanoSeconds)}, +} + +var timeMilliNotAdjustedToUTC = timeType{ + IsAdjustedToUTC: false, + Unit: format.TimeUnit{Millis: new(format.MilliSeconds)}, +} + +var timeMicroNotAdjustedToUTC = timeType{ + IsAdjustedToUTC: false, + Unit: format.TimeUnit{Micros: new(format.MicroSeconds)}, +} + +var timeNanoNotAdjustedToUTC = timeType{ + IsAdjustedToUTC: false, + Unit: format.TimeUnit{Nanos: new(format.NanoSeconds)}, +} + +var timeMilliAdjustedToUTCLogicalType = format.LogicalType{ + Time: (*format.TimeType)(&timeMilliAdjustedToUTC), +} + +var timeMicroAdjustedToUTCLogicalType = format.LogicalType{ + Time: (*format.TimeType)(&timeMicroAdjustedToUTC), +} + +var timeNanoAdjustedToUTCLogicalType = format.LogicalType{ + Time: (*format.TimeType)(&timeNanoAdjustedToUTC), +} + +var 
timeMilliNotAdjustedToUTCLogicalType = format.LogicalType{ + Time: (*format.TimeType)(&timeMilliNotAdjustedToUTC), +} + +var timeMicroNotAdjustedToUTCLogicalType = format.LogicalType{ + Time: (*format.TimeType)(&timeMicroNotAdjustedToUTC), +} + +var timeNanoNotAdjustedToUTCLogicalType = format.LogicalType{ + Time: (*format.TimeType)(&timeNanoNotAdjustedToUTC), +} + +type timeType format.TimeType + +func (t *timeType) tz() *time.Location { + if t.IsAdjustedToUTC { + return time.UTC + } else { + return time.Local + } +} + +func (t *timeType) baseType() Type { + if t.useInt32() { + return int32Type{} + } else { + return int64Type{} + } +} + +func (t *timeType) useInt32() bool { return t.Unit.Millis != nil } + +func (t *timeType) useInt64() bool { return t.Unit.Micros != nil } + +func (t *timeType) String() string { return (*format.TimeType)(t).String() } + +func (t *timeType) Kind() Kind { return t.baseType().Kind() } + +func (t *timeType) Length() int { return t.baseType().Length() } + +func (t *timeType) EstimateSize(n int) int { return t.baseType().EstimateSize(n) } + +func (t *timeType) EstimateNumValues(n int) int { return t.baseType().EstimateNumValues(n) } + +func (t *timeType) Compare(a, b Value) int { return t.baseType().Compare(a, b) } + +func (t *timeType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } + +func (t *timeType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } + +func (t *timeType) LogicalType() *format.LogicalType { + switch t { + case &timeMilliAdjustedToUTC: + return &timeMilliAdjustedToUTCLogicalType + case &timeMicroAdjustedToUTC: + return &timeMicroAdjustedToUTCLogicalType + case &timeNanoAdjustedToUTC: + return &timeNanoAdjustedToUTCLogicalType + case &timeMilliNotAdjustedToUTC: + return &timeMilliNotAdjustedToUTCLogicalType + case &timeMicroNotAdjustedToUTC: + return &timeMicroNotAdjustedToUTCLogicalType + case &timeNanoNotAdjustedToUTC: + return &timeNanoNotAdjustedToUTCLogicalType + default: + 
return &format.LogicalType{Time: (*format.TimeType)(t)} + } +} + +func (t *timeType) ConvertedType() *deprecated.ConvertedType { + switch { + case t.useInt32(): + return &convertedTypes[deprecated.TimeMillis] + case t.useInt64(): + return &convertedTypes[deprecated.TimeMicros] + default: + return nil + } +} + +func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return t.baseType().NewColumnIndexer(sizeLimit) +} + +func (t *timeType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return t.baseType().NewColumnBuffer(columnIndex, numValues) +} + +func (t *timeType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return t.baseType().NewDictionary(columnIndex, numValues, data) +} + +func (t *timeType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return t.baseType().NewPage(columnIndex, numValues, data) +} + +func (t *timeType) NewValues(values []byte, offset []uint32) encoding.Values { + return t.baseType().NewValues(values, offset) +} + +func (t *timeType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return t.baseType().Encode(dst, src, enc) +} + +func (t *timeType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return t.baseType().Decode(dst, src, enc) +} + +func (t *timeType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return t.baseType().EstimateDecodeSize(numValues, src, enc) +} + +func (t *timeType) AssignValue(dst reflect.Value, src Value) error { + return t.baseType().AssignValue(dst, src) +} + +func (t *timeType) ConvertValue(val Value, typ Type) (Value, error) { + switch src := typ.(type) { + case *stringType: + tz := t.tz() + if t.Unit.Micros != nil { + return convertStringToTimeMicros(val, tz) + } else { + return convertStringToTimeMillis(val, tz) + } + case *timestampType: + tz := t.tz() + if t.Unit.Micros != nil { + return convertTimestampToTimeMicros(val, 
src.Unit, src.tz(), tz) + } else { + return convertTimestampToTimeMillis(val, src.Unit, src.tz(), tz) + } + } + return t.baseType().ConvertValue(val, typ) +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_timestamp.go b/vendor/github.com/parquet-go/parquet-go/type_timestamp.go new file mode 100644 index 00000000000..08d58633a1f --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_timestamp.go @@ -0,0 +1,257 @@ +package parquet + +import ( + "reflect" + "time" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// Timestamp constructs of leaf node of TIMESTAMP logical type. +// IsAdjustedToUTC is true by default. +// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp +func Timestamp(unit TimeUnit) Node { + return TimestampAdjusted(unit, true) +} + +// TimestampAdjusted constructs a leaf node of TIMESTAMP logical type +// with the IsAdjustedToUTC property explicitly set. 
+// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp +func TimestampAdjusted(unit TimeUnit, isAdjustedToUTC bool) Node { + // Use pre-allocated instances for common cases + timeUnit := unit.TimeUnit() + if isAdjustedToUTC { + switch { + case timeUnit.Millis != nil: + return Leaf(&timestampMilliAdjustedToUTC) + case timeUnit.Micros != nil: + return Leaf(&timestampMicroAdjustedToUTC) + case timeUnit.Nanos != nil: + return Leaf(&timestampNanoAdjustedToUTC) + } + } else { + switch { + case timeUnit.Millis != nil: + return Leaf(&timestampMilliNotAdjustedToUTC) + case timeUnit.Micros != nil: + return Leaf(&timestampMicroNotAdjustedToUTC) + case timeUnit.Nanos != nil: + return Leaf(&timestampNanoNotAdjustedToUTC) + } + } + // Fallback for unknown unit types + return Leaf(&timestampType{IsAdjustedToUTC: isAdjustedToUTC, Unit: timeUnit}) +} + +var timestampMilliAdjustedToUTC = timestampType{ + IsAdjustedToUTC: true, + Unit: format.TimeUnit{Millis: new(format.MilliSeconds)}, +} + +var timestampMicroAdjustedToUTC = timestampType{ + IsAdjustedToUTC: true, + Unit: format.TimeUnit{Micros: new(format.MicroSeconds)}, +} + +var timestampNanoAdjustedToUTC = timestampType{ + IsAdjustedToUTC: true, + Unit: format.TimeUnit{Nanos: new(format.NanoSeconds)}, +} + +var timestampMilliNotAdjustedToUTC = timestampType{ + IsAdjustedToUTC: false, + Unit: format.TimeUnit{Millis: new(format.MilliSeconds)}, +} + +var timestampMicroNotAdjustedToUTC = timestampType{ + IsAdjustedToUTC: false, + Unit: format.TimeUnit{Micros: new(format.MicroSeconds)}, +} + +var timestampNanoNotAdjustedToUTC = timestampType{ + IsAdjustedToUTC: false, + Unit: format.TimeUnit{Nanos: new(format.NanoSeconds)}, +} + +var timestampMilliAdjustedToUTCLogicalType = format.LogicalType{ + Timestamp: (*format.TimestampType)(&timestampMilliAdjustedToUTC), +} + +var timestampMicroAdjustedToUTCLogicalType = format.LogicalType{ + Timestamp: (*format.TimestampType)(&timestampMicroAdjustedToUTC), +} + +var timestampNanoAdjustedToUTCLogicalType = 
format.LogicalType{ + Timestamp: (*format.TimestampType)(&timestampNanoAdjustedToUTC), +} + +var timestampMilliNotAdjustedToUTCLogicalType = format.LogicalType{ + Timestamp: (*format.TimestampType)(&timestampMilliNotAdjustedToUTC), +} + +var timestampMicroNotAdjustedToUTCLogicalType = format.LogicalType{ + Timestamp: (*format.TimestampType)(&timestampMicroNotAdjustedToUTC), +} + +var timestampNanoNotAdjustedToUTCLogicalType = format.LogicalType{ + Timestamp: (*format.TimestampType)(&timestampNanoNotAdjustedToUTC), +} + +type timestampType format.TimestampType + +func (t *timestampType) tz() *time.Location { + if t.IsAdjustedToUTC { + return time.UTC + } else { + return time.Local + } +} + +func (t *timestampType) String() string { return (*format.TimestampType)(t).String() } + +func (t *timestampType) Kind() Kind { return int64Type{}.Kind() } + +func (t *timestampType) Length() int { return int64Type{}.Length() } + +func (t *timestampType) EstimateSize(n int) int { return int64Type{}.EstimateSize(n) } + +func (t *timestampType) EstimateNumValues(n int) int { return int64Type{}.EstimateNumValues(n) } + +func (t *timestampType) Compare(a, b Value) int { return int64Type{}.Compare(a, b) } + +func (t *timestampType) ColumnOrder() *format.ColumnOrder { return int64Type{}.ColumnOrder() } + +func (t *timestampType) PhysicalType() *format.Type { return int64Type{}.PhysicalType() } + +func (t *timestampType) LogicalType() *format.LogicalType { + switch t { + case &timestampMilliAdjustedToUTC: + return &timestampMilliAdjustedToUTCLogicalType + case &timestampMicroAdjustedToUTC: + return &timestampMicroAdjustedToUTCLogicalType + case &timestampNanoAdjustedToUTC: + return &timestampNanoAdjustedToUTCLogicalType + case &timestampMilliNotAdjustedToUTC: + return &timestampMilliNotAdjustedToUTCLogicalType + case &timestampMicroNotAdjustedToUTC: + return &timestampMicroNotAdjustedToUTCLogicalType + case &timestampNanoNotAdjustedToUTC: + return &timestampNanoNotAdjustedToUTCLogicalType + default: + return &format.LogicalType{Timestamp: 
(*format.TimestampType)(t)} + } +} + +func (t *timestampType) ConvertedType() *deprecated.ConvertedType { + switch { + case t.Unit.Millis != nil: + return &convertedTypes[deprecated.TimestampMillis] + case t.Unit.Micros != nil: + return &convertedTypes[deprecated.TimestampMicros] + default: + return nil + } +} + +func (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return int64Type{}.NewColumnIndexer(sizeLimit) +} + +func (t *timestampType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return int64Type{}.NewDictionary(columnIndex, numValues, data) +} + +func (t *timestampType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return int64Type{}.NewColumnBuffer(columnIndex, numValues) +} + +func (t *timestampType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return int64Type{}.NewPage(columnIndex, numValues, data) +} + +func (t *timestampType) NewValues(values []byte, offsets []uint32) encoding.Values { + return int64Type{}.NewValues(values, offsets) +} + +func (t *timestampType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return int64Type{}.Encode(dst, src, enc) +} + +func (t *timestampType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return int64Type{}.Decode(dst, src, enc) +} + +func (t *timestampType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return int64Type{}.EstimateDecodeSize(numValues, src, enc) +} + +func (t *timestampType) AssignValue(dst reflect.Value, src Value) error { + switch dst.Type() { + case reflect.TypeOf(time.Time{}): + // Check if the value is NULL - if so, assign zero time.Time + if src.IsNull() { + dst.Set(reflect.ValueOf(time.Time{})) + return nil + } + + unit := Nanosecond.TimeUnit() + lt := t.LogicalType() + if lt != nil && lt.Timestamp != nil { + unit = lt.Timestamp.Unit + } + + nanos := src.int64() + switch { + case unit.Millis != 
nil: + nanos = nanos * 1e6 + case unit.Micros != nil: + nanos = nanos * 1e3 + } + + val := time.Unix(0, nanos).UTC() + dst.Set(reflect.ValueOf(val)) + return nil + case reflect.TypeOf((*time.Time)(nil)): + // Handle *time.Time (pointer to time.Time) + if src.IsNull() { + // For NULL values, set the pointer to nil + dst.Set(reflect.Zero(dst.Type())) + return nil + } + + unit := Nanosecond.TimeUnit() + lt := t.LogicalType() + if lt != nil && lt.Timestamp != nil { + unit = lt.Timestamp.Unit + } + + nanos := src.int64() + switch { + case unit.Millis != nil: + nanos = nanos * 1e6 + case unit.Micros != nil: + nanos = nanos * 1e3 + } + + val := time.Unix(0, nanos).UTC() + ptr := &val + dst.Set(reflect.ValueOf(ptr)) + return nil + default: + return int64Type{}.AssignValue(dst, src) + } +} + +func (t *timestampType) ConvertValue(val Value, typ Type) (Value, error) { + switch src := typ.(type) { + case *timestampType: + return convertTimestampToTimestamp(val, src.Unit, t.Unit) + case *dateType: + return convertDateToTimestamp(val, t.Unit, t.tz()) + } + return int64Type{}.ConvertValue(val, typ) +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_uuid.go b/vendor/github.com/parquet-go/parquet-go/type_uuid.go new file mode 100644 index 00000000000..1cf2a4f94fc --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_uuid.go @@ -0,0 +1,80 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// UUID constructs a leaf node of UUID logical type. 
+// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#uuid +func UUID() Node { return Leaf(&uuidType{}) } + +var uuidLogicaType = format.LogicalType{ + UUID: new(format.UUIDType), +} + +type uuidType format.UUIDType + +func (t *uuidType) String() string { return (*format.UUIDType)(t).String() } + +func (t *uuidType) Kind() Kind { return be128Type{}.Kind() } + +func (t *uuidType) Length() int { return be128Type{}.Length() } + +func (t *uuidType) EstimateSize(n int) int { return be128Type{}.EstimateSize(n) } + +func (t *uuidType) EstimateNumValues(n int) int { return be128Type{}.EstimateNumValues(n) } + +func (t *uuidType) Compare(a, b Value) int { return be128Type{}.Compare(a, b) } + +func (t *uuidType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } + +func (t *uuidType) PhysicalType() *format.Type { return &physicalTypes[FixedLenByteArray] } + +func (t *uuidType) LogicalType() *format.LogicalType { return &uuidLogicaType } + +func (t *uuidType) ConvertedType() *deprecated.ConvertedType { return nil } + +func (t *uuidType) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return be128Type{isUUID: true}.NewColumnIndexer(sizeLimit) +} + +func (t *uuidType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return be128Type{isUUID: true}.NewDictionary(columnIndex, numValues, data) +} + +func (t *uuidType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return be128Type{isUUID: true}.NewColumnBuffer(columnIndex, numValues) +} + +func (t *uuidType) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return be128Type{isUUID: true}.NewPage(columnIndex, numValues, data) +} + +func (t *uuidType) NewValues(values []byte, offsets []uint32) encoding.Values { + return be128Type{isUUID: true}.NewValues(values, offsets) +} + +func (t *uuidType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { + return be128Type{isUUID: true}.Encode(dst, src, enc) +} 
+ +func (t *uuidType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { + return be128Type{isUUID: true}.Decode(dst, src, enc) +} + +func (t *uuidType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { + return be128Type{isUUID: true}.EstimateDecodeSize(numValues, src, enc) +} + +func (t *uuidType) AssignValue(dst reflect.Value, src Value) error { + return be128Type{isUUID: true}.AssignValue(dst, src) +} + +func (t *uuidType) ConvertValue(val Value, typ Type) (Value, error) { + return be128Type{isUUID: true}.ConvertValue(val, typ) +} diff --git a/vendor/github.com/parquet-go/parquet-go/type_variant.go b/vendor/github.com/parquet-go/parquet-go/type_variant.go new file mode 100644 index 00000000000..fd6cc06c972 --- /dev/null +++ b/vendor/github.com/parquet-go/parquet-go/type_variant.go @@ -0,0 +1,97 @@ +package parquet + +import ( + "reflect" + + "github.com/parquet-go/parquet-go/deprecated" + "github.com/parquet-go/parquet-go/encoding" + "github.com/parquet-go/parquet-go/format" +) + +// Variant constructs a node of unshredded VARIANT logical type. It is a group with +// two required fields, "metadata" and "value", both byte arrays. +// +// Experimental: The specification for variants is still being developed and the type +// is not fully adopted. Support for this type is subject to change. +// +// Initial support does not attempt to process the variant data. So reading and writing +// data of this type behaves as if it were just a group with two byte array fields, as +// if the logical type annotation were absent. This may change in the future. 
+// +// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#variant +func Variant() Node { + return variantNode{Group{"metadata": Required(Leaf(ByteArrayType)), "value": Required(Leaf(ByteArrayType))}} +} + +// TODO: add ShreddedVariant(Node) function, to create a shredded variant +// where the argument defines the type/structure of the shredded value(s). + +type variantNode struct{ Group } + +func (variantNode) Type() Type { return &variantType{} } + +type variantType format.VariantType + +func (t *variantType) String() string { return (*format.VariantType)(t).String() } + +func (t *variantType) Kind() Kind { panic("cannot call Kind on parquet VARIANT type") } + +func (t *variantType) Length() int { return 0 } + +func (t *variantType) EstimateSize(int) int { return 0 } + +func (t *variantType) EstimateNumValues(int) int { return 0 } + +func (t *variantType) Compare(Value, Value) int { + panic("cannot compare values on parquet VARIANT type") +} + +func (t *variantType) ColumnOrder() *format.ColumnOrder { return nil } + +func (t *variantType) PhysicalType() *format.Type { return nil } + +func (t *variantType) LogicalType() *format.LogicalType { + return &format.LogicalType{Variant: (*format.VariantType)(t)} +} + +func (t *variantType) ConvertedType() *deprecated.ConvertedType { return nil } + +func (t *variantType) NewColumnIndexer(int) ColumnIndexer { + panic("cannot create column indexer from parquet VARIANT type") +} + +func (t *variantType) NewDictionary(int, int, encoding.Values) Dictionary { + panic("cannot create dictionary from parquet VARIANT type") +} + +func (t *variantType) NewColumnBuffer(int, int) ColumnBuffer { + panic("cannot create column buffer from parquet VARIANT type") +} + +func (t *variantType) NewPage(int, int, encoding.Values) Page { + panic("cannot create page from parquet VARIANT type") +} + +func (t *variantType) NewValues(values []byte, _ []uint32) encoding.Values { + panic("cannot create values from parquet VARIANT 
type") +} + +func (t *variantType) Encode(_ []byte, _ encoding.Values, _ encoding.Encoding) ([]byte, error) { + panic("cannot encode parquet VARIANT type") +} + +func (t *variantType) Decode(_ encoding.Values, _ []byte, _ encoding.Encoding) (encoding.Values, error) { + panic("cannot decode parquet VARIANT type") +} + +func (t *variantType) EstimateDecodeSize(_ int, _ []byte, _ encoding.Encoding) int { + panic("cannot estimate decode size of parquet VARIANT type") +} + +func (t *variantType) AssignValue(reflect.Value, Value) error { + panic("cannot assign value to a parquet VARIANT type") +} + +func (t *variantType) ConvertValue(Value, Type) (Value, error) { + panic("cannot convert value to a parquet VARIANT type") +} diff --git a/vendor/github.com/parquet-go/parquet-go/value.go b/vendor/github.com/parquet-go/parquet-go/value.go index 9281493dd16..ef04c0a1be3 100644 --- a/vendor/github.com/parquet-go/parquet-go/value.go +++ b/vendor/github.com/parquet-go/parquet-go/value.go @@ -336,7 +336,15 @@ func makeValue(k Kind, lt *format.LogicalType, v reflect.Value) Value { case FixedLenByteArray: switch v.Kind() { - case reflect.String: // uuid + case reflect.String: + if lt.UUID != nil { // uuid + uuidStr := v.String() + encoded, err := uuid.MustParse(uuidStr).MarshalBinary() + if err != nil { + panic(fmt.Errorf("error marshalling uuid: %w", err)) + } + return makeValueByteArray(k, unsafe.SliceData(encoded), len(encoded)) + } return makeValueString(k, v.String()) case reflect.Array: if v.Type().Elem().Kind() == reflect.Uint8 { diff --git a/vendor/github.com/parquet-go/parquet-go/writer.go b/vendor/github.com/parquet-go/parquet-go/writer.go index 9b742dbdc67..5326198695c 100644 --- a/vendor/github.com/parquet-go/parquet-go/writer.go +++ b/vendor/github.com/parquet-go/parquet-go/writer.go @@ -101,8 +101,14 @@ func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *Generic t := typeOf[T]() var genWriteErr error + if t != nil { + if columnName, ok := 
validateColumns(dereference(t)); !ok { + genWriteErr = fmt.Errorf("cannot write %v: it has columns with the same parquet column name %q", t, columnName) + } + } + + if schema == nil && t != nil { - schema = schemaOf(dereference(t)) + schema = schemaOf(dereference(t), config.SchemaConfig.StructTags...) if len(schema.Columns()) == 0 { genWriteErr = fmt.Errorf("cannot write %v: it has no columns (maybe it has no exported fields)", t) } @@ -119,7 +125,7 @@ func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *Generic if genWriteErr != nil { writeFn = func(*GenericWriter[T], []T) (int, error) { return 0, genWriteErr } } else { - writeFn = writeFuncOf[T](t, config.Schema) + writeFn = writeFuncOf[T](t, config.Schema, config.SchemaConfig.StructTags) } return &GenericWriter[T]{ @@ -135,7 +141,7 @@ func NewGenericWriter[T any](output io.Writer, options ...WriterOption) *Generic type writeFunc[T any] func(*GenericWriter[T], []T) (int, error) -func writeFuncOf[T any](t reflect.Type, schema *Schema) writeFunc[T] { +func writeFuncOf[T any](t reflect.Type, schema *Schema, tagReplacements []StructTagOption) writeFunc[T] { if t == nil { return (*GenericWriter[T]).writeAny } @@ -144,22 +150,22 @@ func writeFuncOf[T any](t reflect.Type, schema *Schema) writeFunc[T] { return (*GenericWriter[T]).writeRows case reflect.Struct: - return makeWriteFunc[T](t, schema) + return makeWriteFunc[T](t, schema, tagReplacements) case reflect.Pointer: if e := t.Elem(); e.Kind() == reflect.Struct { - return makeWriteFunc[T](t, schema) + return makeWriteFunc[T](t, schema, tagReplacements) } } panic("cannot create writer for values of type " + t.String()) } -func makeWriteFunc[T any](t reflect.Type, schema *Schema) writeFunc[T] { - writeRows := writeRowsFuncOf(t, schema, nil) +func makeWriteFunc[T any](t reflect.Type, schema *Schema, tagReplacements []StructTagOption) writeFunc[T] { + writeRows := writeRowsFuncOf(t, schema, nil, tagReplacements) return func(w *GenericWriter[T], rows 
[]T) (n int, err error) { if w.columns == nil { - w.columns = make([]ColumnBuffer, len(w.base.writer.columns)) - for i, c := range w.base.writer.columns { + w.columns = make([]ColumnBuffer, len(w.base.writer.currentRowGroup.columns)) + for i, c := range w.base.writer.currentRowGroup.columns { // These fields are usually lazily initialized when writing rows, // we need them to exist now tho. c.columnBuffer = c.newColumnBuffer() @@ -175,7 +181,15 @@ func makeWriteFunc[T any](t reflect.Type, schema *Schema) writeFunc[T] { } func (w *GenericWriter[T]) Close() error { - return w.base.Close() + if err := w.base.Close(); err != nil { + return err + } + // Nil out the columns slice to allow the column buffers to be garbage + // collected and to ensure that any subsequent use of this writer after + // Close will result in a clear panic rather than operating on closed + // resources. + w.columns = nil + return nil } func (w *GenericWriter[T]) Flush() error { @@ -186,23 +200,36 @@ func (w *GenericWriter[T]) Reset(output io.Writer) { w.base.Reset(output) } -func (w *GenericWriter[T]) Write(rows []T) (int, error) { - return w.base.writer.writeRows(len(rows), func(i, j int) (int, error) { - n, err := w.write(w, rows[i:j:j]) - if err != nil { - return n, err - } +func (w *GenericWriter[T]) Write(rows []T) (written int, err error) { + var n int + currentRowGroup := w.base.writer.currentRowGroup + for len(rows) > 0 { + n, err = currentRowGroup.writeRows(len(rows), func(i, j int) (int, error) { + n, err := w.write(w, rows[i:j:j]) + if err != nil { + return n, err + } - for _, c := range w.base.writer.columns { - if c.columnBuffer.Size() >= int64(c.bufferSize) { - if err := c.Flush(); err != nil { - return n, err + for _, c := range currentRowGroup.columns { + if c.columnBuffer != nil && c.columnBuffer.Size() >= int64(c.bufferSize) { + if err := c.Flush(); err != nil { + return n, err + } } } - } - return n, nil - }) + return n, nil + }) + rows = rows[n:] + written += n + if err != 
ErrTooManyRowGroups { + break + } + if err = w.base.writer.flush(); err != nil { + break + } + } + return } func (w *GenericWriter[T]) WriteRows(rows []Row) (int, error) { @@ -270,6 +297,66 @@ func (w *GenericWriter[T]) File() FileView { return w.base.File() } +// ConcurrentRowGroupWriter is a row group writer that can be used to write row groups +// in parallel. Multiple row groups can be created concurrently and written to independently, +// but they must be committed serially to maintain the order of row groups in the file. +// +// See BeginRowGroup for more information on how this can be used. +// +// While multiple row groups can be created concurrently, a single row group must be written +// sequentially. +type ConcurrentRowGroupWriter interface { + RowWriterWithSchema + + // Flush flushes any buffered data in the row group's column writers. + // This could be called before Commit to ensure all data pages are flushed. + Flush() error + + // ColumnWriters returns the column writers for this row group, allowing + // direct access to write values to individual columns. + ColumnWriters() []*ColumnWriter + + // Commit commits the row group to the parent writer, returning the number + // of rows written and an error if any. This method must be called serially + // (not concurrently) to maintain row group order in the file. + // + // If the parent writer has any pending rows buffered, they will be flushed + // before this row group is written. + // + // After Commit returns successfully, the row group will be empty and can + // be reused. + Commit() (int64, error) +} + +// BeginRowGroup returns a new ConcurrentRowGroupWriter that can be written to in parallel with +// other row groups. However these need to be committed back to the writer serially using the +// Commit method on the row group. +// +// Example usage could look something like: +// +// writer := parquet.NewGenericWriter[any](...) 
+// rgs := make([]parquet.ConcurrentRowGroupWriter, 5) +// var wg sync.WaitGroup +// for i := range rgs { +// rg := writer.BeginRowGroup() +// rgs[i] = rg +// wg.Add(1) +// go func() { +// defer wg.Done() +// writeChunkRows(i, rg) +// }() +// } +// wg.Wait() +// for _, rg := range rgs { +// if _, err := rg.Commit(); err != nil { +// return err +// } +// } +// return writer.Close() +func (w *GenericWriter[T]) BeginRowGroup() ConcurrentRowGroupWriter { + return newWriterRowGroup(w.base.writer, w.base.config) +} + var ( _ RowWriterWithSchema = (*GenericWriter[any])(nil) _ RowReaderFrom = (*GenericWriter[any])(nil) @@ -282,6 +369,8 @@ var ( _ RowWriterWithSchema = (*GenericWriter[map[struct{}]struct{}])(nil) _ RowReaderFrom = (*GenericWriter[map[struct{}]struct{}])(nil) _ RowGroupWriter = (*GenericWriter[map[struct{}]struct{}])(nil) + + _ ConcurrentRowGroupWriter = (*writerRowGroup)(nil) ) // Deprecated: A Writer uses a parquet schema and sequence of Go values to @@ -360,6 +449,11 @@ func (w *Writer) configure(schema *Schema) { // Close must be called after all values were produced to the writer in order to // flush all buffers and write the parquet footer. 
func (w *Writer) Close() error { + for _, c := range w.ColumnWriters() { + if err := c.Close(); err != nil { + return err + } + } if w.writer != nil { return w.writer.close() } @@ -444,14 +538,14 @@ func (w *Writer) WriteRowGroup(rowGroup RowGroup) (int64, error) { if err := w.writer.flush(); err != nil { return 0, err } - w.writer.configureBloomFilters(rowGroup.ColumnChunks()) + w.writer.currentRowGroup.configureBloomFilters(rowGroup.ColumnChunks()) rows := rowGroup.Rows() defer rows.Close() n, err := CopyRows(w.writer, rows) if err != nil { return n, err } - return w.writer.writeRowGroup(rowGroup.Schema(), rowGroup.SortingColumns()) + return w.writer.writeRowGroup(w.writer.currentRowGroup, rowGroup.Schema(), rowGroup.SortingColumns()) } // ReadRowsFrom reads rows from the reader passed as arguments and writes them @@ -504,7 +598,14 @@ func (w *Writer) SetKeyValueMetadata(key, value string) { // ColumnWriters returns writers for each column. This allows applications to // write values directly to each column instead of having to first assemble // values into rows to use WriteRows. -func (w *Writer) ColumnWriters() []*ColumnWriter { return w.writer.columns } +func (w *Writer) ColumnWriters() []*ColumnWriter { return w.writer.currentRowGroup.columns } + +// BeginRowGroup returns a new ConcurrentRowGroupWriter that can be written to in parallel with +// other row groups. However these need to be committed back to the writer serially using the +// Commit method on the row group. 
+func (w *Writer) BeginRowGroup() ConcurrentRowGroupWriter { + return newWriterRowGroup(w.writer, w.config) +} type writerFileView struct { writer *writer @@ -550,16 +651,16 @@ func (w *writerFileView) Size() int64 { } func (w *writerFileView) ColumnIndexes() []format.ColumnIndex { - return w.writer.columnIndex + return w.writer.currentRowGroup.columnIndex } func (w *writerFileView) OffsetIndexes() []format.OffsetIndex { - return w.writer.offsetIndex + return w.writer.currentRowGroup.offsetIndex } func (w *writerFileView) Root() *Column { if w.writer.fileMetaData != nil { - root, _ := openColumns(nil, w.writer.fileMetaData, w.writer.columnIndex, w.writer.offsetIndex) + root, _ := openColumns(nil, w.writer.fileMetaData, w.writer.currentRowGroup.columnIndex, w.writer.currentRowGroup.offsetIndex) return root } return nil @@ -575,83 +676,24 @@ func (w *writerFileView) RowGroups() []RowGroup { return nil } -type writer struct { - buffer *bufio.Writer - writer offsetTrackingWriter - values [][]Value - numRows int64 - maxRows int64 - - createdBy string - metadata []format.KeyValue - +type writerRowGroup struct { + writer *writer + config *WriterConfig + values [][]Value + numRows int64 + maxRows int64 columns []*ColumnWriter columnChunk []format.ColumnChunk columnIndex []format.ColumnIndex offsetIndex []format.OffsetIndex - - columnOrders []format.ColumnOrder - schemaElements []format.SchemaElement - rowGroups []format.RowGroup - columnIndexes [][]format.ColumnIndex - offsetIndexes [][]format.OffsetIndex - sortingColumns []format.SortingColumn - - fileMetaData *format.FileMetaData } -func newWriter(output io.Writer, config *WriterConfig) *writer { - w := new(writer) - if config.WriteBufferSize <= 0 { - w.writer.Reset(output) - } else { - w.buffer = bufio.NewWriterSize(output, config.WriteBufferSize) - w.writer.Reset(w.buffer) +func newWriterRowGroup(w *writer, config *WriterConfig) *writerRowGroup { + rg := &writerRowGroup{ + writer: w, + config: config, + maxRows: 
config.MaxRowsPerRowGroup, } - w.maxRows = config.MaxRowsPerRowGroup - w.createdBy = config.CreatedBy - w.metadata = make([]format.KeyValue, 0, len(config.KeyValueMetadata)) - for k, v := range config.KeyValueMetadata { - w.metadata = append(w.metadata, format.KeyValue{Key: k, Value: v}) - } - sortKeyValueMetadata(w.metadata) - w.sortingColumns = make([]format.SortingColumn, len(config.Sorting.SortingColumns)) - - config.Schema.forEachNode(func(name string, node Node) { - nodeType := node.Type() - - repetitionType := (*format.FieldRepetitionType)(nil) - if node != config.Schema { // the root has no repetition type - repetitionType = fieldRepetitionTypePtrOf(node) - } - // For backward compatibility with older readers, the parquet specification - // recommends to set the scale and precision on schema elements when the - // column is of logical type decimal. - logicalType := nodeType.LogicalType() - scale, precision := (*int32)(nil), (*int32)(nil) - if logicalType != nil && logicalType.Decimal != nil { - scale = &logicalType.Decimal.Scale - precision = &logicalType.Decimal.Precision - } - - typeLength := (*int32)(nil) - if n := int32(nodeType.Length()); n > 0 { - typeLength = &n - } - - w.schemaElements = append(w.schemaElements, format.SchemaElement{ - Type: nodeType.PhysicalType(), - TypeLength: typeLength, - RepetitionType: repetitionType, - Name: name, - NumChildren: int32(len(node.Fields())), - ConvertedType: nodeType.ConvertedType(), - Scale: scale, - Precision: precision, - FieldID: int32(node.ID()), - LogicalType: logicalType, - }) - }) dataPageType := format.DataPage if config.DataPageVersion == 2 { @@ -688,6 +730,7 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { pool: config.ColumnPageBuffers, columnPath: leaf.path, columnType: columnType, + originalType: columnType, columnIndex: columnType.NewColumnIndexer(config.ColumnIndexSizeLimit), columnFilter: searchBloomFilterColumn(config.BloomFilters, leaf.path), compression: compression, @@ 
-707,7 +750,8 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { // compressed, the data pages are encoded with the hybrid // RLE/Bit-Pack encoding which doesn't benefit from an extra // compression layer. - isCompressed: isCompressed(compression) && (dataPageType != format.DataPageV2 || dictionary == nil), + isCompressed: isCompressed(compression) && (dataPageType != format.DataPageV2 || dictionary == nil), + dictionaryMaxBytes: config.DictionaryMaxBytes, } c.header.encoder.Reset(c.header.protocol.NewWriter(&c.buffers.header)) @@ -721,37 +765,29 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { } c.encoding = encoding + c.originalEncoding = encoding c.encodings = addEncoding(c.encodings, c.encoding.Encoding()) sortPageEncodings(c.encodings) - w.columns = append(w.columns, c) - - if sortingIndex := searchSortingColumn(config.Sorting.SortingColumns, leaf.path); sortingIndex < len(w.sortingColumns) { - w.sortingColumns[sortingIndex] = format.SortingColumn{ - ColumnIdx: int32(leaf.columnIndex), - Descending: config.Sorting.SortingColumns[sortingIndex].Descending(), - NullsFirst: config.Sorting.SortingColumns[sortingIndex].NullsFirst(), - } - } + rg.columns = append(rg.columns, c) }) // Pre-allocate the backing array so that in most cases where the rows // contain a single value we will hit collocated memory areas when writing // rows to the writer. This won't benefit repeated columns much but in that // case we would just waste a bit of memory which we can afford. 
- values := make([]Value, len(w.columns)) - w.values = make([][]Value, len(w.columns)) + values := make([]Value, len(rg.columns)) + rg.values = make([][]Value, len(rg.columns)) for i := range values { - w.values[i] = values[i : i : i+1] + rg.values[i] = values[i : i : i+1] } - w.columnChunk = make([]format.ColumnChunk, len(w.columns)) - w.columnIndex = make([]format.ColumnIndex, len(w.columns)) - w.offsetIndex = make([]format.OffsetIndex, len(w.columns)) - w.columnOrders = make([]format.ColumnOrder, len(w.columns)) + rg.columnChunk = make([]format.ColumnChunk, len(rg.columns)) + rg.columnIndex = make([]format.ColumnIndex, len(rg.columns)) + rg.offsetIndex = make([]format.OffsetIndex, len(rg.columns)) - for i, c := range w.columns { - w.columnChunk[i] = format.ColumnChunk{ + for i, c := range rg.columns { + rg.columnChunk[i] = format.ColumnChunk{ MetaData: format.ColumnMetaData{ Type: format.Type(c.columnType.Kind()), Encoding: c.encodings, @@ -762,12 +798,205 @@ func newWriter(output io.Writer, config *WriterConfig) *writer { } } - for i, c := range w.columns { - c.columnChunk = &w.columnChunk[i] - c.offsetIndex = &w.offsetIndex[i] + for i, c := range rg.columns { + c.columnChunk = &rg.columnChunk[i] + c.offsetIndex = &rg.offsetIndex[i] + } + + return rg +} + +func (rg *writerRowGroup) reset() { + rg.numRows = 0 + for _, c := range rg.columns { + c.reset() + } +} + +func (rg *writerRowGroup) configureBloomFilters(columnChunks []ColumnChunk) { + for i, c := range rg.columns { + if c.columnFilter != nil { + c.resizeBloomFilter(columnChunks[i].NumValues()) + } + } +} + +func (rg *writerRowGroup) Schema() *Schema { + return rg.config.Schema +} + +func (rg *writerRowGroup) ColumnWriters() []*ColumnWriter { + return rg.columns +} + +func (rg *writerRowGroup) Flush() error { + for _, c := range rg.columns { + if err := c.Flush(); err != nil { + return err + } + } + return nil +} + +func (rg *writerRowGroup) Commit() (int64, error) { + if err := rg.writer.flush(); err != 
nil { + return 0, err } + return rg.writer.writeRowGroup(rg, nil, nil) +} + +func (rg *writerRowGroup) WriteRows(rows []Row) (int, error) { + return rg.writeRows(len(rows), func(start, end int) (int, error) { + defer func() { + for i, values := range rg.values { + clearValues(values) + rg.values[i] = values[:0] + } + }() + + // TODO: if an error occurs in this method the writer may be left in an + // partially functional state. Applications are not expected to continue + // using the writer after getting an error, but maybe we could ensure that + // we are preventing further use as well? + for _, row := range rows[start:end] { + for columnIndex, columnValues := range row.Range { + rg.values[columnIndex] = append(rg.values[columnIndex], columnValues...) + } + } - for i, c := range w.columns { + for i, values := range rg.values { + if len(values) > 0 { + if _, err := rg.columns[i].WriteRowValues(values); err != nil { + return 0, err + } + } + } + + return end - start, nil + }) +} + +func (rg *writerRowGroup) writeRows(numRows int, write func(i, j int) (int, error)) (int, error) { + written := 0 + + for written < numRows { + remain := rg.maxRows - rg.numRows + length := numRows - written + + if remain <= 0 { + return written, ErrTooManyRowGroups + } + + if remain < int64(length) { + length = int(remain) + } + + // Since the writer cannot flush pages across row boundaries, calls to + // WriteRows with very large slices can result in greatly exceeding the + // target page size. To set a limit to the impact of these large writes + // we chunk the input in slices of 64 rows. 
+ const maxRowsPerWrite = 64 + if length > maxRowsPerWrite { + length = maxRowsPerWrite + } + + n, err := write(written, written+length) + written += n + rg.numRows += int64(n) + if err != nil { + return written, err + } + } + + return written, nil +} + +type writer struct { + buffer *bufio.Writer + writer offsetTrackingWriter + currentRowGroup *writerRowGroup + + createdBy string + metadata []format.KeyValue + + columnOrders []format.ColumnOrder + schemaElements []format.SchemaElement + rowGroups []format.RowGroup + columnIndexes [][]format.ColumnIndex + offsetIndexes [][]format.OffsetIndex + sortingColumns []format.SortingColumn + + fileMetaData *format.FileMetaData +} + +func newWriter(output io.Writer, config *WriterConfig) *writer { + w := new(writer) + if config.WriteBufferSize <= 0 { + w.writer.Reset(output) + } else { + w.buffer = bufio.NewWriterSize(output, config.WriteBufferSize) + w.writer.Reset(w.buffer) + } + w.createdBy = config.CreatedBy + w.metadata = make([]format.KeyValue, 0, len(config.KeyValueMetadata)) + for k, v := range config.KeyValueMetadata { + w.metadata = append(w.metadata, format.KeyValue{Key: k, Value: v}) + } + sortKeyValueMetadata(w.metadata) + w.sortingColumns = make([]format.SortingColumn, len(config.Sorting.SortingColumns)) + + config.Schema.forEachNode(func(name string, node Node) { + nodeType := node.Type() + + repetitionType := (*format.FieldRepetitionType)(nil) + if node != config.Schema { // the root has no repetition type + repetitionType = fieldRepetitionTypePtrOf(node) + } + // For backward compatibility with older readers, the parquet specification + // recommends to set the scale and precision on schema elements when the + // column is of logical type decimal. 
+ logicalType := nodeType.LogicalType() + scale, precision := (*int32)(nil), (*int32)(nil) + if logicalType != nil && logicalType.Decimal != nil { + scale = &logicalType.Decimal.Scale + precision = &logicalType.Decimal.Precision + } + + typeLength := (*int32)(nil) + if n := int32(nodeType.Length()); n > 0 { + typeLength = &n + } + + w.schemaElements = append(w.schemaElements, format.SchemaElement{ + Type: nodeType.PhysicalType(), + TypeLength: typeLength, + RepetitionType: repetitionType, + Name: name, + NumChildren: int32(len(node.Fields())), + ConvertedType: nodeType.ConvertedType(), + Scale: scale, + Precision: precision, + FieldID: int32(node.ID()), + LogicalType: logicalType, + }) + }) + + w.currentRowGroup = newWriterRowGroup(w, config) + + if len(config.Sorting.SortingColumns) > 0 { + forEachLeafColumnOf(config.Schema, func(leaf leafColumn) { + if sortingIndex := searchSortingColumn(config.Sorting.SortingColumns, leaf.path); sortingIndex < len(w.sortingColumns) { + w.sortingColumns[sortingIndex] = format.SortingColumn{ + ColumnIdx: int32(leaf.columnIndex), + Descending: config.Sorting.SortingColumns[sortingIndex].Descending(), + NullsFirst: config.Sorting.SortingColumns[sortingIndex].NullsFirst(), + } + } + }) + } + + w.columnOrders = make([]format.ColumnOrder, len(w.currentRowGroup.columns)) + for i, c := range w.currentRowGroup.columns { w.columnOrders[i] = *c.columnType.ColumnOrder() } @@ -781,9 +1010,7 @@ func (w *writer) reset(writer io.Writer) { w.buffer.Reset(writer) w.writer.Reset(w.buffer) } - for _, c := range w.columns { - c.reset() - } + w.currentRowGroup.reset() for i := range w.rowGroups { w.rowGroups[i] = format.RowGroup{} } @@ -816,7 +1043,7 @@ func (w *writer) close() error { } func (w *writer) flush() error { - _, err := w.writeRowGroup(nil, nil) + _, err := w.writeRowGroup(w.currentRowGroup, nil, nil) return err } @@ -831,14 +1058,6 @@ func (w *writer) writeFileHeader() error { return nil } -func (w *writer) 
configureBloomFilters(columnChunks []ColumnChunk) { - for i, c := range w.columns { - if c.columnFilter != nil { - c.resizeBloomFilter(columnChunks[i].NumValues()) - } - } -} - func (w *writer) writeFileFooter() error { // The page index is composed of two sections: column and offset indexes. // They are written after the row groups, right before the footer (which @@ -914,11 +1133,11 @@ func (w *writer) writeFileFooter() error { return err } -func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns []SortingColumn) (int64, error) { - if len(w.columns) == 0 { +func (w *writer) writeRowGroup(rg *writerRowGroup, rowGroupSchema *Schema, rowGroupSortingColumns []SortingColumn) (int64, error) { + if len(rg.columns) == 0 { return 0, nil } - numRows := w.columns[0].totalRowCount() + numRows := rg.columns[0].totalRowCount() if numRows == 0 { return 0, nil } @@ -928,16 +1147,10 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] } defer func() { - w.numRows = 0 - for _, c := range w.columns { - c.reset() - } - for i := range w.columnIndex { - w.columnIndex[i] = format.ColumnIndex{} - } + rg.reset() }() - for _, c := range w.columns { + for _, c := range rg.columns { if err := c.Flush(); err != nil { return 0, err } @@ -951,8 +1164,8 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] } fileOffset := w.writer.offset - for i, c := range w.columns { - w.columnIndex[i] = format.ColumnIndex(c.columnIndex.ColumnIndex()) + for i, c := range rg.columns { + rg.columnIndex[i] = format.ColumnIndex(c.columnIndex.ColumnIndex()) if c.dictionary != nil { c.columnChunk.MetaData.DictionaryPageOffset = w.writer.offset @@ -961,6 +1174,11 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] } } + // Skip columns with nil pageBuffer (e.g., empty struct groups with no leaf columns) + if c.pageBuffer == nil { + continue + } + dataPageOffset := w.writer.offset 
c.columnChunk.MetaData.DataPageOffset = dataPageOffset for j := range c.offsetIndex.PageLocations { @@ -977,7 +1195,7 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] } } - for _, c := range w.columns { + for _, c := range rg.columns { if len(c.filter) > 0 { c.columnChunk.MetaData.BloomFilterOffset = w.writer.offset if err := c.writeBloomFilter(&w.writer); err != nil { @@ -989,8 +1207,8 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] totalByteSize := int64(0) totalCompressedSize := int64(0) - for i := range w.columnChunk { - c := &w.columnChunk[i].MetaData + for i := range rg.columnChunk { + c := &rg.columnChunk[i].MetaData sortPageEncodingStats(c.EncodingStats) totalByteSize += int64(c.TotalUncompressedSize) totalCompressedSize += int64(c.TotalCompressedSize) @@ -1010,25 +1228,25 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] }) } - columns := make([]format.ColumnChunk, len(w.columnChunk)) - copy(columns, w.columnChunk) + columns := make([]format.ColumnChunk, len(rg.columnChunk)) + copy(columns, rg.columnChunk) - columnIndex := make([]format.ColumnIndex, len(w.columnIndex)) - copy(columnIndex, w.columnIndex) + columnIndex := make([]format.ColumnIndex, len(rg.columnIndex)) + copy(columnIndex, rg.columnIndex) - offsetIndex := make([]format.OffsetIndex, len(w.offsetIndex)) - copy(offsetIndex, w.offsetIndex) + offsetIndex := make([]format.OffsetIndex, len(rg.offsetIndex)) + copy(offsetIndex, rg.offsetIndex) for i := range columns { c := &columns[i] c.MetaData.EncodingStats = make([]format.PageEncodingStats, len(c.MetaData.EncodingStats)) - copy(c.MetaData.EncodingStats, w.columnChunk[i].MetaData.EncodingStats) + copy(c.MetaData.EncodingStats, rg.columnChunk[i].MetaData.EncodingStats) } for i := range offsetIndex { c := &offsetIndex[i] c.PageLocations = make([]format.PageLocation, len(c.PageLocations)) - copy(c.PageLocations, w.offsetIndex[i].PageLocations) + 
copy(c.PageLocations, rg.offsetIndex[i].PageLocations) } w.rowGroups = append(w.rowGroups, format.RowGroup{ @@ -1046,85 +1264,26 @@ func (w *writer) writeRowGroup(rowGroupSchema *Schema, rowGroupSortingColumns [] return numRows, nil } -func (w *writer) WriteRows(rows []Row) (int, error) { - return w.writeRows(len(rows), func(start, end int) (int, error) { - defer func() { - for i, values := range w.values { - clearValues(values) - w.values[i] = values[:0] - } - }() - - // TODO: if an error occurs in this method the writer may be left in an - // partially functional state. Applications are not expected to continue - // using the writer after getting an error, but maybe we could ensure that - // we are preventing further use as well? - for _, row := range rows[start:end] { - row.Range(func(columnIndex int, columnValues []Value) bool { - w.values[columnIndex] = append(w.values[columnIndex], columnValues...) - return true - }) - } - - for i, values := range w.values { - if len(values) > 0 { - if _, err := w.columns[i].WriteRowValues(values); err != nil { - return 0, err - } - } - } - - return end - start, nil - }) -} - -func (w *writer) writeRows(numRows int, write func(i, j int) (int, error)) (int, error) { - written := 0 - - for written < numRows { - remain := w.maxRows - w.numRows - length := numRows - written - - if remain == 0 { - remain = w.maxRows - - if err := w.flush(); err != nil { - return written, err - } - } - - if remain < int64(length) { - length = int(remain) - } - - // Since the writer cannot flush pages across row boundaries, calls to - // WriteRows with very large slices can result in greatly exceeding the - // target page size. To set a limit to the impact of these large writes - // we chunk the input in slices of 64 rows. - // - // Note that this mechanism isn't perfect; for example, values may hold - // large byte slices which could still cause the column buffers to grow - // beyond the target page size. 
- const maxRowsPerWrite = 64 - if length > maxRowsPerWrite { - length = maxRowsPerWrite - } - - n, err := write(written, written+length) +func (w *writer) WriteRows(rows []Row) (written int, err error) { + var n int + for len(rows) > 0 { + n, err = w.currentRowGroup.WriteRows(rows) + rows = rows[n:] written += n - w.numRows += int64(n) - if err != nil { - return written, err + if err != ErrTooManyRowGroups { + break + } + if err = w.flush(); err != nil { + break } } - - return written, nil + return } // The WriteValues method is intended to work in pair with WritePage to allow // programs to target writing values to specific columns of of the writer. func (w *writer) WriteValues(values []Value) (numValues int, err error) { - return w.columns[values[0].Column()].writeValues(values) + return w.currentRowGroup.columns[values[0].Column()].writeValues(values) } // One writerBuffers is used by each writer instance, the memory buffers here @@ -1228,14 +1387,18 @@ type ColumnWriter struct { pageBuffer io.ReadWriteSeeker numPages int - columnPath columnPath - columnType Type - columnIndex ColumnIndexer - columnBuffer ColumnBuffer - columnFilter BloomFilterColumn - encoding encoding.Encoding - compression compress.Codec - dictionary Dictionary + columnPath columnPath + columnType Type + originalType Type // Original type before any encoding changes + columnIndex ColumnIndexer + columnBuffer ColumnBuffer + plainColumnBuffer ColumnBuffer // Retained plain buffer for fallback after lazy creation + originalColumnBuffer ColumnBuffer // Original buffer to restore after row group flush + columnFilter BloomFilterColumn + encoding encoding.Encoding + originalEncoding encoding.Encoding // Original encoding before any changes + compression compress.Codec + dictionary Dictionary dataPageType format.PageType maxRepetitionLevel byte @@ -1257,11 +1420,27 @@ type ColumnWriter struct { isCompressed bool encodings []format.Encoding - columnChunk *format.ColumnChunk - offsetIndex 
*format.OffsetIndex + columnChunk *format.ColumnChunk + offsetIndex *format.OffsetIndex + hasSwitchedToPlain bool // Tracks if dictionary encoding was switched to PLAIN + dictionaryMaxBytes int64 // Per-column dictionary size limit + + // Pooled buffers used by optional and repeated column buffers that need + // to be released when the writer is closed. + rowsBuffer *buffer[int32] + repetitionLevelsBuffer *buffer[byte] + definitionLevelsBuffer *buffer[byte] } func (c *ColumnWriter) reset() { + if c.hasSwitchedToPlain { + c.columnType = c.originalType + c.encoding = c.originalEncoding + c.hasSwitchedToPlain = false + } + if c.originalColumnBuffer != nil { + c.columnBuffer = c.originalColumnBuffer + } if c.columnBuffer != nil { c.columnBuffer.Reset() } @@ -1307,8 +1486,29 @@ func (c *ColumnWriter) Flush() (err error) { return nil } if c.columnBuffer.Len() > 0 { + // Check dictionary size limit BEFORE writing the page + // to decide if we should switch to PLAIN for future pages + var fallbackToPlain bool + if c.dictionary != nil && !c.hasSwitchedToPlain && c.dictionaryMaxBytes > 0 { + if currentDictSize := c.dictionary.Size(); currentDictSize > c.dictionaryMaxBytes { + fallbackToPlain = true + } + } + + // Write the current buffered page (still with current encoding) defer c.columnBuffer.Reset() _, err = c.writeDataPage(c.columnBuffer.Page()) + if err != nil { + return err + } + + // After writing the page, convert to PLAIN for future pages if needed + // This avoids wasteful buffer allocation if this was the last page + if fallbackToPlain { + if err := c.fallbackDictionaryToPlain(); err != nil { + return fmt.Errorf("converting dictionary to plain: %w", err) + } + } } return err } @@ -1334,6 +1534,11 @@ func (c *ColumnWriter) flushFilterPages() (err error) { return nil } + // Skip columns with nil pageBuffer (e.g., empty struct groups with no leaf columns) + if c.pageBuffer == nil { + return nil + } + // When the filter was not allocated, the writer did not know how 
many // values were going to be seen and therefore could not properly size the // filter ahead of time. In this case, we read back all the pages that we @@ -1372,7 +1577,7 @@ func (c *ColumnWriter) flushFilterPages() (err error) { pageReader = rbuf } - pbuf := (*buffer)(nil) + pbuf := (*buffer[byte])(nil) defer func() { if pbuf != nil { pbuf.unref() @@ -1431,9 +1636,19 @@ func (c *ColumnWriter) newColumnBuffer() ColumnBuffer { column := c.columnType.NewColumnBuffer(int(c.bufferIndex), c.columnType.EstimateNumValues(int(c.bufferSize))) switch { case c.maxRepetitionLevel > 0: - column = newRepeatedColumnBuffer(column, c.maxRepetitionLevel, c.maxDefinitionLevel, nullsGoLast) + // Since these buffers are pooled, we can afford to allocate a bit more memory in + // order to reduce the risk of needing to resize the buffer. + size := int(float64(column.Cap()) * 1.5) + c.repetitionLevelsBuffer = buffers.get(size) + c.definitionLevelsBuffer = buffers.get(size) + column = newRepeatedColumnBuffer(column, c.repetitionLevelsBuffer.data[:0], c.definitionLevelsBuffer.data[:0], c.maxRepetitionLevel, c.maxDefinitionLevel, nullsGoLast) case c.maxDefinitionLevel > 0: - column = newOptionalColumnBuffer(column, c.maxDefinitionLevel, nullsGoLast) + // Since these buffers are pooled, we can afford to allocate a bit more memory in + // order to reduce the risk of needing to resize the buffer. + size := int(float64(column.Cap()) * 1.5) + c.rowsBuffer = indexes.get(size) + c.definitionLevelsBuffer = buffers.get(size) + column = newOptionalColumnBuffer(column, c.rowsBuffer.data[:0], c.definitionLevelsBuffer.data[:0], c.maxDefinitionLevel, nullsGoLast) } return column } @@ -1452,6 +1667,7 @@ func (c *ColumnWriter) WriteRowValues(rows []Value) (int, error) { // Lazily create the row group column so we don't need to allocate it if // rows are not written individually to the column. 
c.columnBuffer = c.newColumnBuffer() + c.originalColumnBuffer = c.columnBuffer } else { startingRows = int64(c.columnBuffer.Len()) } @@ -1474,6 +1690,12 @@ func (c *ColumnWriter) Close() (err error) { if err := c.Flush(); err != nil { return err } + bufferUnref(c.rowsBuffer) + bufferUnref(c.repetitionLevelsBuffer) + bufferUnref(c.definitionLevelsBuffer) + c.rowsBuffer = nil + c.repetitionLevelsBuffer = nil + c.definitionLevelsBuffer = nil c.columnBuffer = nil return nil } @@ -1481,6 +1703,10 @@ func (c *ColumnWriter) Close() (err error) { func (c *ColumnWriter) writeValues(values []Value) (numValues int, err error) { if c.columnBuffer == nil { c.columnBuffer = c.newColumnBuffer() + // Save the original dictionary-encoding buffer to restore after row group flush + if c.originalColumnBuffer == nil { + c.originalColumnBuffer = c.columnBuffer + } } return c.columnBuffer.WriteValues(values) } @@ -1685,6 +1911,36 @@ func (c *ColumnWriter) writePageTo(size int64, writeTo func(io.Writer) (int64, e return nil } +// fallbackDictionaryToPlain switches future pages from dictionary to PLAIN encoding. +// This is called when a column's dictionary size limit is exceeded. +func (c *ColumnWriter) fallbackDictionaryToPlain() error { + // Switch to PLAIN encoding for future writes + // Get the underlying type without the indexed wrapper + if indexedType, ok := c.columnType.(*indexedType); ok { + c.columnType = indexedType.Type + } + if c.plainColumnBuffer == nil { + c.plainColumnBuffer = c.columnType.NewColumnBuffer(int(c.bufferIndex), int(c.bufferSize)) + } + c.columnBuffer = c.plainColumnBuffer + c.encoding = &plain.Encoding{} + c.encodings = addEncoding(c.encodings, format.Plain) + // DON'T clear the dictionary reference! + // We need to keep it so: + // 1. The dictionary page can be written (required for existing dict-encoded pages) + // 2. 
Existing pages that were written with dictionary indexes can be read + // + // The hasSwitchedToPlain flag prevents new values from being added to the dictionary + // + // Note: We are NOT re-encoding existing pages. This means: + // - Pages written before the limit was hit will remain dictionary-encoded + // - Pages written after will be PLAIN-encoded + // - The dictionary page will still be written at row group flush + // - Mixed encodings in the same column chunk are valid per Parquet spec + c.hasSwitchedToPlain = true + return nil +} + func (c *ColumnWriter) makePageStatistics(page Page) format.Statistics { numNulls := page.NumNulls() minValue, maxValue, _ := page.Bounds() diff --git a/vendor/modules.txt b/vendor/modules.txt index 42cacff28be..94bf0632ad6 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1132,8 +1132,12 @@ github.com/opentracing/opentracing-go/log github.com/openzipkin/zipkin-go/model github.com/openzipkin/zipkin-go/proto/zipkin_proto3 github.com/openzipkin/zipkin-go/reporter -# github.com/parquet-go/parquet-go v0.25.2-0.20250911172247-41fe9a8fbd81 -## explicit; go 1.23 +# github.com/parquet-go/bitpack v0.1.0 +## explicit; go 1.24.0 +github.com/parquet-go/bitpack +github.com/parquet-go/bitpack/unsafecast +# github.com/parquet-go/parquet-go v0.25.2-0.20251113212313-bb7dcf6d014e +## explicit; go 1.24.9 github.com/parquet-go/parquet-go github.com/parquet-go/parquet-go/bloom github.com/parquet-go/parquet-go/bloom/xxhash @@ -1156,10 +1160,8 @@ github.com/parquet-go/parquet-go/format github.com/parquet-go/parquet-go/hashprobe github.com/parquet-go/parquet-go/hashprobe/aeshash github.com/parquet-go/parquet-go/hashprobe/wyhash -github.com/parquet-go/parquet-go/internal/bitpack github.com/parquet-go/parquet-go/internal/bytealg github.com/parquet-go/parquet-go/internal/debug -github.com/parquet-go/parquet-go/internal/unsafecast github.com/parquet-go/parquet-go/sparse # github.com/pelletier/go-toml/v2 v2.2.4 ## explicit; go 1.21.0