Skip to content

Commit c3e4ea7

Browse files
committed
Specialize palette impl for 6, 8 and 16 bpb
Instead of wasting a bunch of memory and CPU time, use vectors to save memory for the larger palette sizes, and a hash map to look up IDs instead of brute-force searching for them. 16 bpb benefits enormously from this, with write operations becoming 100x faster. 8 bpb gets 20x faster and, to my surprise, 6 bpb gets 4x faster.
1 parent 13e9d28 commit c3e4ea7

File tree

2 files changed

+206
-57
lines changed

2 files changed

+206
-57
lines changed

lib/Palette.h

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
#ifndef HAVE_PALETTE_H
2+
#define HAVE_PALETTE_H
3+
4+
/*
 * Validation helpers shared by the palette implementations.
 * MAX_PALETTE_SIZE is the largest number of entries a palette may hold.
 */
template <size_t MAX_PALETTE_SIZE>
class PaletteUtils final {
public:
    /*
     * Verifies that a palette of `size` entries is acceptable, i.e. that
     * 1 <= size <= MAX_PALETTE_SIZE.
     *
     * Throws std::length_error when the size is zero or exceeds the maximum.
     */
    static void checkSize(size_t size) {
        // The two failure conditions are mutually exclusive, so the order of
        // these guards does not affect which error is reported.
        if (size == 0) {
            throw std::length_error("palette cannot have a zero size");
        }

        if (size > MAX_PALETTE_SIZE) {
            const std::string limit = std::to_string(MAX_PALETTE_SIZE);
            const std::string received = std::to_string(size);
            throw std::length_error("palette size should be at most " + limit + " entries, but received " + received + " entries");
        }
    }
};
16+
17+
/*
18+
* Small palettes use fixed arrays to avoid vector indirections and allocations
19+
* Performance is more important here because the amount of wasted memory is insignificant
20+
* regardless of the number of elements in the palette.
21+
*
22+
* Lookups are done with a linear search, which is faster than using a hash map for small
23+
* numbers of elements.
24+
*/
25+
template <size_t MAX_PALETTE_SIZE, typename Block>
26+
class SmallPalette final {
27+
private:
28+
std::array<Block, MAX_PALETTE_SIZE> palette;
29+
unsigned short nextPaletteIndex = 0;
30+
31+
void initFromData(gsl::span<const Block> paletteEntries) {
32+
PaletteUtils<MAX_PALETTE_SIZE>::checkSize(paletteEntries.size());
33+
34+
memcpy(palette.data(), paletteEntries.data(), paletteEntries.size() * sizeof(Block));
35+
nextPaletteIndex = (unsigned short)paletteEntries.size();
36+
}
37+
public:
38+
SmallPalette(Block block) {
39+
palette[nextPaletteIndex++] = block;
40+
}
41+
42+
SmallPalette(std::vector<Block>& paletteEntries) {
43+
initFromData(gsl::span<const Block>(paletteEntries.data(), paletteEntries.size()));
44+
}
45+
46+
SmallPalette(gsl::span<const Block>& paletteEntries) {
47+
initFromData(paletteEntries);
48+
}
49+
50+
SmallPalette(const SmallPalette& otherArray) {
51+
memcpy(palette.data(), otherArray.palette.data(), sizeof(palette));
52+
nextPaletteIndex = otherArray.nextPaletteIndex;
53+
}
54+
55+
Block get(unsigned int offset) const {
56+
return palette[offset];
57+
}
58+
59+
void set(unsigned int offset, Block val) {
60+
palette[offset] = val;
61+
}
62+
63+
const gsl::span<const Block> getPalette() const {
64+
return gsl::span<const Block>(palette.data(), nextPaletteIndex);
65+
}
66+
67+
size_t size() const {
68+
return nextPaletteIndex;
69+
}
70+
71+
int addOrLookup(Block val) {
72+
for (int offset = 0; offset < nextPaletteIndex; ++offset) {
73+
if (palette[offset] == val) {
74+
return offset;
75+
}
76+
}
77+
78+
if (nextPaletteIndex >= MAX_PALETTE_SIZE) {
79+
return -1;
80+
}
81+
int offset = nextPaletteIndex++;
82+
palette[offset] = val;
83+
return offset;
84+
}
85+
};
86+
87+
/*
88+
* For large numbers of elements, it's common for a significant fraction of the palette's
89+
* capacity to be unused, so using a fixed array like SmallPalette would waste a lot of
90+
* memory. We use a vector instead and pay a small performance penalty.
91+
*
92+
* We use the memory we saved to instead keep a hash map of block -> offset for lookups,
93+
* which significantly improves performance for large palettes. (We only technically save
94+
* memory as long as the palette is less than half (?) full, but the performance benefits
95+
* remain in any case.)
96+
*/
97+
template <size_t MAX_PALETTE_SIZE, typename Block>
98+
class LargePalette final {
99+
private:
100+
std::vector<Block> palette;
101+
std::unordered_map<Block, unsigned int> blockToOffset;
102+
103+
void initFromData(gsl::span<const Block> paletteEntries) {
104+
PaletteUtils<MAX_PALETTE_SIZE>::checkSize(paletteEntries.size());
105+
106+
palette = std::vector<Block>(paletteEntries.begin(), paletteEntries.end());
107+
for (unsigned int i = 0; i < palette.size(); ++i) {
108+
blockToOffset[palette[i]] = i;
109+
}
110+
}
111+
112+
public:
113+
LargePalette(Block block) {
114+
palette.push_back(block);
115+
blockToOffset[block] = 0;
116+
}
117+
118+
LargePalette(std::vector<Block>& paletteEntries) {
119+
initFromData(gsl::span<const Block>(paletteEntries.data(), paletteEntries.size()));
120+
}
121+
122+
LargePalette(gsl::span<const Block>& paletteEntries) {
123+
initFromData(paletteEntries);
124+
}
125+
126+
LargePalette(const LargePalette& otherArray) {
127+
palette = otherArray.palette;
128+
blockToOffset = otherArray.blockToOffset;
129+
}
130+
131+
Block get(unsigned int offset) const {
132+
return palette[offset];
133+
}
134+
135+
void set(unsigned int offset, Block val) {
136+
palette[offset] = val;
137+
}
138+
139+
const gsl::span<const Block> getPalette() const {
140+
return gsl::span<const Block>(palette.data(), palette.size());
141+
}
142+
143+
size_t size() const {
144+
return palette.size();
145+
}
146+
147+
int addOrLookup(Block val) {
148+
auto it = blockToOffset.find(val);
149+
if (it != blockToOffset.end()) {
150+
return it->second;
151+
}
152+
153+
if (palette.size() >= MAX_PALETTE_SIZE) {
154+
return -1;
155+
}
156+
int offset = palette.size();
157+
palette.push_back(val);
158+
blockToOffset[val] = offset;
159+
return offset;
160+
}
161+
};
162+
#endif

lib/PalettedBlockArray.h

Lines changed: 44 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@
1212
#include <exception>
1313
#include <sstream>
1414
#include <string>
15+
#include <unordered_map>
1516
#include <unordered_set>
1617
#include <vector>
1718
#include <gsl/span>
1819

1920
#include "VanillaPaletteSize.h"
21+
#include "Palette.h"
2022

2123
template<typename Block>
2224
class IPalettedBlockArray {
@@ -77,16 +79,20 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
7779
static const unsigned short WORD_COUNT = Base::ARRAY_CAPACITY / BLOCKS_PER_WORD + (Base::ARRAY_CAPACITY % BLOCKS_PER_WORD ? 1 : 0);
7880

7981
static const unsigned int MAX_PALETTE_OFFSET = 1 << BITS_PER_BLOCK_INT;
82+
8083
public:
8184
static const unsigned int PAYLOAD_SIZE = WORD_COUNT * sizeof(Word);
82-
static const unsigned int MAX_PALETTE_SIZE = MAX_PALETTE_OFFSET < Base::ARRAY_CAPACITY ? MAX_PALETTE_OFFSET : Base::ARRAY_CAPACITY;
85+
static const size_t MAX_PALETTE_SIZE = MAX_PALETTE_OFFSET < Base::ARRAY_CAPACITY ? MAX_PALETTE_OFFSET : Base::ARRAY_CAPACITY;
8386
private:
84-
std::array<Word, WORD_COUNT> words;
87+
using Palette = std::conditional_t<
88+
BITS_PER_BLOCK >= VanillaPaletteSize::BPB_6,
89+
LargePalette<MAX_PALETTE_SIZE, Block>,
90+
SmallPalette<MAX_PALETTE_SIZE, Block>
91+
>;
8592

86-
//TODO: use a vector for this instead of a fixed array? might be less performant but will save memory for large formats
87-
std::array<Block, MAX_PALETTE_SIZE> palette;
93+
std::array<Word, WORD_COUNT> words;
8894

89-
unsigned short nextPaletteIndex = 0;
95+
Palette palette;
9096

9197
inline unsigned short getArrayOffset(Coord x, Coord y, Coord z) const {
9298
return
@@ -121,6 +127,8 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
121127
void locateAndReportInvalidOffset() const {
122128
//Slow path, to allow giving detailed errors when a problem has already been detected by the fast path
123129
auto blockCount = 0;
130+
const auto max = palette.size();
131+
124132
for (auto wordIdx = 0; wordIdx < words.size(); wordIdx++) {
125133
const auto word = words[wordIdx];
126134
for (
@@ -129,15 +137,15 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
129137
blockCount++, shift += BITS_PER_BLOCK_INT
130138
) {
131139
const auto offset = (word >> shift) & BLOCK_MASK;
132-
if (offset >= nextPaletteIndex) {
140+
if (offset >= max) {
133141
std::ostringstream ss;
134142

135143
const auto blockIdx = (wordIdx * BLOCKS_PER_WORD) + (shift / BITS_PER_BLOCK_INT);
136144
const auto x = (blockIdx >> (Base::COORD_BIT_SIZE * 2)) & Base::COORD_MASK;
137145
const auto z = (blockIdx >> Base::COORD_BIT_SIZE) & Base::COORD_MASK;
138146
const auto y = blockIdx & Base::COORD_MASK;
139147

140-
ss << "offset table contains invalid offset " << offset << " at position " << x << "," << y << "," << z << " (max valid offset: " << (nextPaletteIndex - 1) << ")";
148+
ss << "offset table contains invalid offset " << offset << " at position " << x << "," << y << "," << z << " (max valid offset: " << (max - 1) << ")";
141149
throw std::range_error(ss.str());
142150
}
143151
}
@@ -152,8 +160,9 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
152160
Word invalid = 0;
153161

154162
Word expected = 0;
163+
int max = palette.size();
155164
for (unsigned int shift = 0; shift < BLOCKS_PER_WORD * BITS_PER_BLOCK_INT; shift += BITS_PER_BLOCK_INT) {
156-
expected |= ((nextPaletteIndex - 1) << shift);
165+
expected |= ((max - 1) << shift);
157166
}
158167

159168
//Fast path - use carry-out vectors to detect invalid offsets
@@ -176,7 +185,7 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
176185
}
177186

178187
void validate() const {
179-
if (MAX_PALETTE_OFFSET == MAX_PALETTE_SIZE && nextPaletteIndex >= MAX_PALETTE_SIZE) {
188+
if (MAX_PALETTE_OFFSET == MAX_PALETTE_SIZE && palette.size() >= MAX_PALETTE_SIZE) {
180189
//Every possible offset representable is valid, therefore no validation is required
181190
//this is an uncommon case, but more frequent in small palettes, which is a win because small palettes are more
182191
//expensive to verify than big ones due to more bitwise operations needed to extract the offsets
@@ -194,36 +203,24 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
194203

195204
public:
196205

197-
PalettedBlockArray(Block block) {
206+
PalettedBlockArray(Block block) : palette(block) {
198207
memset(words.data(), 0, sizeof(words));
199-
palette[nextPaletteIndex++] = block;
200208
}
201209

202-
PalettedBlockArray(gsl::span<uint8_t> &wordArray, std::vector<Block> &paletteEntries) {
210+
PalettedBlockArray(gsl::span<uint8_t> &wordArray, std::vector<Block> &paletteEntries) : palette(paletteEntries) {
203211
if (wordArray.size() != sizeof(words)) {
204212
//TODO: known-size span can replace this check
205213
throw std::length_error("word array size should be exactly " + std::to_string(sizeof(words)) + " bytes for a " + std::to_string(BITS_PER_BLOCK_INT) + "bpb block array, got " + std::to_string(wordArray.size()) + " bytes");
206214
}
207-
if (paletteEntries.size() > MAX_PALETTE_SIZE) {
208-
throw std::length_error("palette size should be at most " + std::to_string(MAX_PALETTE_SIZE) + " entries for a " + std::to_string(BITS_PER_BLOCK_INT) + "bpb block array, got " + std::to_string(paletteEntries.size()) + " entries");
209-
}
210-
if (paletteEntries.size() == 0) {
211-
throw std::length_error("palette cannot have a zero size");
212-
}
213-
214215
memcpy(words.data(), wordArray.data(), sizeof(words));
215-
memcpy(palette.data(), paletteEntries.data(), paletteEntries.size() * sizeof(Block));
216-
nextPaletteIndex = (unsigned short)paletteEntries.size();
217216

218217
validate();
219218

220219
this->mayNeedGC = true;
221220
}
222221

223-
PalettedBlockArray(const PalettedBlockArray &otherArray) {
222+
PalettedBlockArray(const PalettedBlockArray &otherArray) : palette(otherArray.palette) {
224223
memcpy(words.data(), otherArray.words.data(), sizeof(words));
225-
memcpy(palette.data(), otherArray.palette.data(), sizeof(palette));
226-
nextPaletteIndex = otherArray.nextPaletteIndex;
227224
this->mayNeedGC = otherArray.mayNeedGC;
228225
}
229226

@@ -232,11 +229,11 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
232229
}
233230

234231
const gsl::span<const Block> getPalette() const {
235-
return gsl::span<const Block>(palette.data(), nextPaletteIndex);
232+
return palette.getPalette();
236233
}
237234

238235
unsigned short getPaletteSize() const {
239-
return nextPaletteIndex;
236+
return palette.size();
240237
}
241238

242239
unsigned short getMaxPaletteSize() const {
@@ -249,8 +246,8 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
249246
for (Coord x = 0; x < Base::ARRAY_DIM; ++x) {
250247
for (Coord z = 0; z < Base::ARRAY_DIM; ++z) {
251248
for (Coord y = 0; y < Base::ARRAY_DIM; ++y) {
252-
auto inserted = hasFound.insert(palette[_getPaletteOffset(x, y, z)]).second;
253-
if (inserted && hasFound.size() == getPaletteSize()) {
249+
auto inserted = hasFound.insert(palette.get(_getPaletteOffset(x, y, z))).second;
250+
if (inserted && hasFound.size() == palette.size()) {
254251
break;
255252
}
256253
}
@@ -266,37 +263,25 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
266263

267264
Block get(Coord x, Coord y, Coord z) const {
268265
unsigned short offset = _getPaletteOffset(x, y, z);
269-
assert(offset < nextPaletteIndex);
270-
return palette[offset];
266+
assert(offset < palette.size());
267+
return palette.get(offset);
271268
}
272269

273270
bool set(Coord x, Coord y, Coord z, Block val) {
274-
//TODO (suggested by sandertv): check performance when recording last written block and palette offset - might improve performance for repetetive writes
275-
276-
short offset = -1;
271+
int offset = palette.addOrLookup(val);
277272
bool needGC = true;
278-
for (short i = 0; i < nextPaletteIndex; ++i) {
279-
if (palette[i] == val) {
280-
offset = i;
281-
break;
282-
}
283-
}
284273

285274
if (offset == -1) {
286-
if (nextPaletteIndex >= MAX_PALETTE_SIZE) {
287-
if (MAX_PALETTE_SIZE < Base::ARRAY_CAPACITY || this->mayNeedGC) {
288-
return false;
289-
}
290-
//overwrite existing offset on fully used, non-dirty palette
291-
offset = _getPaletteOffset(x, y, z);
292-
//we skip GC because:
293-
//- we know this block isn't already in the palette
294-
//- we know every block in the array has its own palette entry (palette full and not dirty), therefore we must be overwriting an entry that's only used by 1 block anyway.
295-
needGC = false;
296-
} else {
297-
offset = (short)nextPaletteIndex++;
275+
if (MAX_PALETTE_SIZE < Base::ARRAY_CAPACITY || this->mayNeedGC) {
276+
return false;
298277
}
299-
palette[offset] = val;
278+
//overwrite existing offset on fully used, non-dirty palette
279+
offset = _getPaletteOffset(x, y, z);
280+
//we skip GC because:
281+
//- we know this block isn't already in the palette
282+
//- we know every block in the array has its own palette entry (palette full and not dirty), therefore we must be overwriting an entry that's only used by 1 block anyway.
283+
needGC = false;
284+
palette.set(offset, val);
300285
}
301286

302287
_setPaletteOffset(x, y, z, offset);
@@ -310,9 +295,9 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
310295

311296
void replaceAll(Block from, Block to) {
312297
//TODO: clean up any duplicates
313-
for (short i = 0; i < nextPaletteIndex; ++i) {
314-
if (palette[i] == from) {
315-
palette[i] = to;
298+
for (short i = 0; i < palette.size(); ++i) {
299+
if (palette.get(i) == from) {
300+
palette.set(i, to);
316301

317302
//don't return here, because there might be duplicated block states from previous replace operations
318303
}
@@ -335,8 +320,10 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
335320
template<typename BlockArray>
336321
void _fastUpsize(const BlockArray& otherArray) {
337322
auto otherPalette = otherArray.getPalette();
338-
nextPaletteIndex = otherPalette.size();
339-
std::copy(otherPalette.data(), otherPalette.data() + otherPalette.size(), palette.data());
323+
324+
palette.~Palette();
325+
new (&palette) Palette(otherPalette);
326+
340327
for (Coord x = 0; x < Base::ARRAY_DIM; ++x) {
341328
for (Coord z = 0; z < Base::ARRAY_DIM; ++z) {
342329
for (Coord y = 0; y < Base::ARRAY_DIM; ++y) {

0 commit comments

Comments
 (0)