/*
Copyright (c) 2012-2013 Maarten Baert <[email protected]>

This file is part of SimpleScreenRecorder.

SimpleScreenRecorder is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

SimpleScreenRecorder is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
*/

#include "Global.h"
#include "FastScaler_Scale.h"

#include "FastScaler_Scale_Generic.h"
#include "TempBuffer.h"

/*
==== Fallback MipMapper ====

Uses 'wannabe-SIMD': 4x 16-bit values in normal 64-bit registers. This works as long as overflow is avoided.
Performs best on 64-bit systems, but even on 32-bit it should still be reasonably good.

It's important that this function is force-inlined because this allows the compiler to eliminate the inner loops for common mipmap factors.
*/
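
/*
A worked example of the packing trick (for illustration only): for a pixel
c = 0x11223344, ((c << 24) | c) & vec4x16(0xff) gives 0x0011003300220044,
i.e. the four 8-bit channels spread over four 16-bit lanes (0x11, 0x33,
0x22, 0x44). Each lane then accumulates at most (1 << (mx + my)) byte values
plus the rounding offset; with mx + my <= 8 the worst case is
255 * 256 + 128 = 65408 < 65536, so the lanes never overflow into each other.
The final ((uint32_t) (q >> 24)) | ((uint32_t) q) reverses the spread and
reassembles the pixel.
*/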

inline __attribute__((always_inline))
void MipMap_BGRA_Fallback_Dynamic(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
								  uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
	const uint64_t mask = vec4x16(0xff);
	const uint64_t offset = vec4x16(1u << (mx + my - 1)); // rounding offset for the final shift
	unsigned int wrem = in_w & ((1u << mx) - 1); // input columns left over after the full blocks
	unsigned int hrem = in_h & ((1u << my) - 1); // input rows left over after the full blocks
	for(unsigned int out_j = 0; out_j < (in_h >> my); ++out_j) {
		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		// full (1 << mx) x (1 << my) blocks
		for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < (1u << my); ++mj) {
				for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			in += (1u << mx);
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
		// partial last column: weight the rightmost pixel so it fills the block
		if(wrem != 0) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < (1u << my); ++mj) {
				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				uint64_t c = in2[wrem - 1];
				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
	}
	// partial last row: weight the bottom row so it fills the blocks
	if(hrem != 0) {
		unsigned int out_j = in_h >> my;
		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
				for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
				uint64_t c = in2[mi];
				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
			}
			in += (1u << mx);
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
		// bottom-right corner: weight both the last row and the last column
		if(wrem != 0) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				uint64_t c = in2[wrem - 1];
				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
				uint64_t c = in2[mi];
				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
			}
			uint64_t c = in2[wrem - 1];
			sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1)) * ((1u << mx) - (wrem - 1));
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
	}
}

void MipMap_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
						  uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
	Q_ASSERT(mx + my <= 8);
	// specialize the common mipmap factors so mx and my become compile-time constants inside the inlined body
	switch((mx << 8) | my) {
		case 0x0000: Q_ASSERT(false); break;
		case 0x0001: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 1); break;
		case 0x0002: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 2); break;
		case 0x0100: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 0); break;
		case 0x0101: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 1); break;
		case 0x0102: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 2); break;
		case 0x0103: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 3); break;
		case 0x0200: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 0); break;
		case 0x0201: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 1); break;
		case 0x0202: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 2); break;
		case 0x0203: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 3); break;
		case 0x0301: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 1); break;
		case 0x0302: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 2); break;
		case 0x0303: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 3); break;
		default: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, mx, my); break;
	}
}
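
/*
Illustration of what the specialization above buys (a sketch, not literal
compiler output): with mx = my = 1 the inner loops have the constant bound
(1u << 1) == 2, so per output pixel the body reduces to roughly

	uint64_t sum = lane(in2[0]) + lane(in2[1]);   // mi loop unrolled
	in2 = next_row(in2);                          // advance by in_stride
	sum += lane(in2[0]) + lane(in2[1]);           // mj loop unrolled
	uint64_t q = ((sum + offset) >> 2) & mask;    // shift amount folded

where lane(c) abbreviates ((c << 24) | c) & mask and next_row stands for the
stride cast in the real code.
*/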

/*
==== Fallback Bilinear Scaler ====

Uses 'wannabe-SIMD' like the mipmapper. It's slightly less efficient here because of the multiplications, but still much faster than plain 32-bit integers.
*/
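
/*
Overflow bound for the weighted sums below (assuming, as its use here
suggests, that Bilinear_MapIndex produces an 8-bit fraction in [0, 256]):
each 16-bit lane of p holds at most 255, and the two weights of a lerp sum
to 256, so

	p0 * (256 - f) + p1 * f + 128  <=  255 * 256 + 128  =  65408  <  65536

which is why one multiply-accumulate per axis still fits in the 16-bit lanes.
*/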

void Bilinear_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
							unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride,
							unsigned int mx, unsigned int my) {
	Q_ASSERT(in_w > 1 && in_h > 1); //TODO// support size 1?
	Q_ASSERT(out_w > 1 && out_h > 1); //TODO// support size 1?
	Q_ASSERT(in_w < (1 << 28) && in_h < (1 << 28));
	Q_ASSERT(out_w < (1 << 28) && out_h < (1 << 28));

	// precompute horizontal offsets and fractions (reused for every row)
	TempBuffer<unsigned int> x_offset_table, x_fraction_table;
	x_offset_table.alloc(out_w);
	x_fraction_table.alloc(out_w);
	for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
		Bilinear_MapIndex(out_i, in_w, out_w, mx, x_offset_table[out_i], x_fraction_table[out_i]);
	}

	const uint64_t mask = vec4x16(0xff);
	const uint64_t offset = vec4x16(128);

	// scale
	for(unsigned int out_j = 0; out_j < out_h; ++out_j) {
		unsigned int y_offset, y_fraction;
		Bilinear_MapIndex(out_j, in_h, out_h, my, y_offset, y_fraction);
		unsigned int y_fraction_inv = 256 - y_fraction;
		unsigned int *x_offset_ptr = x_offset_table.data(), *x_fraction_ptr = x_fraction_table.data();
		const uint32_t *in1 = (const uint32_t*) (in_data + in_stride * (int) y_offset);
		const uint32_t *in2 = (const uint32_t*) (in_data + in_stride * ((int) y_offset + 1));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
			unsigned int x_offset = *(x_offset_ptr++), x_fraction = *(x_fraction_ptr++), x_fraction_inv = 256 - x_fraction;
			uint64_t c[4] = {in1[x_offset], in1[x_offset + 1], in2[x_offset], in2[x_offset + 1]}; // 2x2 neighborhood
			uint64_t p[4] = {((c[0] << 24) | c[0]) & mask, ((c[1] << 24) | c[1]) & mask, ((c[2] << 24) | c[2]) & mask, ((c[3] << 24) | c[3]) & mask}; // spread into 16-bit lanes
			uint64_t q[2] = {((p[0] * x_fraction_inv + p[1] * x_fraction + offset) >> 8) & mask, ((p[2] * x_fraction_inv + p[3] * x_fraction + offset) >> 8) & mask}; // horizontal lerp
			uint64_t r = ((q[0] * y_fraction_inv + q[1] * y_fraction + offset) >> 8) & mask; // vertical lerp
			*(out++) = ((uint32_t) (r >> 24)) | ((uint32_t) r);
		}
	}
}

void Scale_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
						 unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride) {
	Scale_BGRA_Generic(in_w, in_h, in_data, in_stride, out_w, out_h, out_data, out_stride, MipMap_BGRA_Fallback, Bilinear_BGRA_Fallback);
}
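
/*
Example usage (a minimal sketch; the buffer sizes and tightly packed 4-byte
strides below are the caller's assumptions, not part of this file):

	std::vector<uint8_t> in(in_w * in_h * 4), out(out_w * out_h * 4);
	// ... fill 'in' with BGRA pixels ...
	Scale_BGRA_Fallback(in_w, in_h, in.data(), (int) (in_w * 4),
						out_w, out_h, out.data(), (int) (out_w * 4));
*/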