// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // Automatically generated file; DO NOT EDIT. #pragma once #include #include #include #include "arrow/util/dispatch.h" #include "arrow/util/ubsan.h" namespace arrow { namespace internal { namespace { using ::arrow::util::SafeLoad; template struct UnpackBits128 { using simd_batch = xsimd::make_sized_batch_t; inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) { memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 1-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 1, 2, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 4, 5, 6, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 8, 9, 10, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 12, 13, 14, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 16, 17, 18, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 20, 21, 22, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 24, 25, 26, 27 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 28, 29, 30, 31 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 1; return in; } inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 2-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 2, 4, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 8, 10, 12, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 16, 18, 20, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 0, 2, 4, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 8, 10, 12, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 16, 18, 20, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 2; return in; } inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 3-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 3, 6, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 12, 15, 18, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; shifts = simd_batch{ 24, 27, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 4, 7, 10, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 16, 19, 22, 25 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 28, 0, 2, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 8, 11, 14, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 20, 23, 26, 29 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 3; return in; } inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xf; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 4-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 4; return in; } inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1f; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 5-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 5, 10, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; shifts = simd_batch{ 20, 25, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 8, 13, 18, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 0, 1, 6, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1 }; shifts = simd_batch{ 16, 21, 26, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; shifts = simd_batch{ 4, 9, 14, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4), SafeLoad(in + 4) }; shifts = simd_batch{ 24, 0, 2, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; shifts = simd_batch{ 12, 17, 22, 27 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 5; return in; } inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3f; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 6-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 6, 12, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 24, 0, 4, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; shifts = simd_batch{ 16, 22, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; shifts = simd_batch{ 0, 6, 12, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) }; shifts = simd_batch{ 24, 0, 4, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) }; shifts = simd_batch{ 16, 22, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; shifts = simd_batch{ 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 6; return in; } inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7f; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 7-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 7, 14, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 0, 3, 10, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 24, 0, 6, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) }; shifts = simd_batch{ 20, 0, 2, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; shifts = simd_batch{ 16, 23, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; shifts = simd_batch{ 12, 19, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3 }; shifts = simd_batch{ 8, 15, 22, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; shifts = simd_batch{ 4, 11, 18, 25 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 7; return in; } inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xff; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 8-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 8; return in; } inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1ff; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 9-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5 }; shifts = simd_batch{ 0, 9, 18, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1 }; shifts = simd_batch{ 4, 13, 22, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) }; shifts = simd_batch{ 8, 17, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; shifts = simd_batch{ 12, 21, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) }; shifts = simd_batch{ 16, 0, 2, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6) }; shifts = simd_batch{ 20, 0, 6, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; shifts = simd_batch{ 0, 1, 10, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) }; shifts = simd_batch{ 0, 5, 14, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 9; return in; } inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3ff; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 10-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2 }; shifts = simd_batch{ 0, 10, 20, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; shifts = simd_batch{ 8, 18, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) }; shifts = simd_batch{ 16, 0, 4, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; shifts = simd_batch{ 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2 }; shifts = simd_batch{ 0, 10, 20, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) }; shifts = simd_batch{ 8, 18, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8), SafeLoad(in + 8) }; shifts = simd_batch{ 16, 0, 4, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) }; shifts = simd_batch{ 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 10; return in; } inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7ff; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 11-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) }; shifts = simd_batch{ 0, 11, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 4 to 7 words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 23 | SafeLoad(in + 2) << 9, SafeLoad(in + 2), SafeLoad(in + 2) }; shifts = simd_batch{ 12, 0, 2, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 8 to 11 words = simd_batch{ SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 25 | SafeLoad(in + 4) << 7 }; shifts = simd_batch{ 0, 3, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 12 to 15 words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; shifts = simd_batch{ 4, 15, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 16 to 19 words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6) }; shifts = simd_batch{ 16, 0, 6, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 20 to 23 words = simd_batch{ SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3 }; shifts = simd_batch{ 0, 7, 18, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 24 to 27 words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) }; shifts = simd_batch{ 8, 19, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 28 to 31 words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10), SafeLoad(in + 10) }; shifts = simd_batch{ 20, 0, 10, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; in += 11; return in; } inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xfff; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; // extract 12-bit bundles 0 to 3 words = simd_batch{ SafeLoad(in + 0), SafeLoad