Commit a263b55

Move SSSE3 code to a separate library; this should hopefully fix some of the compilation issues in older GCC versions (32-bit).

Parent: d188481

15 files changed: +576, -421 lines

bootstrap (+4, -1)

@@ -1,3 +1,6 @@
-#!/bin/sh
+#!/bin/bash
+
+set -e
+cd "$( dirname "${BASH_SOURCE[0]}" )"
 
 autoreconf --install --force

postinstall (+2, -1)

@@ -1,7 +1,8 @@
-#!/bin/sh
+#!/bin/bash
 # You should run this (as root) after installation/uninstallation to make sure the libraries, desktop entry and icon will be found.
 
 set -e
+cd "$( dirname "${BASH_SOURCE[0]}" )"
 
 if [ x"$( whoami )" != x"root" ]; then
 	echo "Error: postinstall should be run as root"

simple-uninstall (+1, -1)

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # Run this if you used 'simple-build-and-install' and you want to uninstall the program again.
 
 set -e

src/AV/FastScaler_Convert.h (+1, -1)

@@ -23,5 +23,5 @@ along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
 void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
 
 #if SSR_USE_X86_ASM
-void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) __attribute__((__target__("sse,sse2,sse3,ssse3")));
+void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
 #endif
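
This declaration change is the crux of the commit: the per-function __attribute__((__target__("sse,sse2,sse3,ssse3"))) was how SSSE3 code generation was enabled inside a translation unit otherwise built without those flags, and it is presumably what broke on older 32-bit GCC versions. With the attribute gone, the SSSE3 implementation moves into its own file (FastScaler_Convert_SSSE3.cpp below) that can be built as a separate library with the SSSE3 compiler flags, while callers keep an ordinary declaration and choose an implementation at run time. A minimal sketch of that call-site pattern, not taken from this diff; the Convert_BGRA_YUV420() dispatcher name and the __builtin_cpu_supports() check (a GCC 4.8+ builtin) are illustrative assumptions:

// Sketch only: hypothetical dispatcher, not part of the commit.
#include <stdint.h>

void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
#if SSR_USE_X86_ASM
// defined in FastScaler_Convert_SSSE3.cpp, which would be compiled with -mssse3
void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
#endif

void Convert_BGRA_YUV420(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
#if SSR_USE_X86_ASM
	if(__builtin_cpu_supports("ssse3")) {
		Convert_BGRA_YUV420_SSSE3(w, h, in_data, in_stride, out_data, out_stride);
		return;
	}
#endif
	Convert_BGRA_YUV420_Fallback(w, h, in_data, in_stride, out_data, out_stride);
}

The build-system wiring for the new library is presumably in the remaining changed files, which are not shown in this excerpt.
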
New file (+61 lines): the fallback BGRA-to-YUV420 converter, split out of src/AV/FastScaler_Convert.cpp

/*
Copyright (c) 2012-2013 Maarten Baert <[email protected]>

This file is part of SimpleScreenRecorder.

SimpleScreenRecorder is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

SimpleScreenRecorder is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
*/

#include "Global.h"
#include "FastScaler_Convert.h"

/*
==== Fallback BGRA-to-YUV420 Converter ====

Nothing special, just plain C code. It processes blocks of 2x2 pixels of the input image and produces 2x2 Y, 1x1 U and 1x1 V values.
*/

void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
	Q_ASSERT(w % 2 == 0 && h % 2 == 0);

	const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;

	for(unsigned int j = 0; j < h / 2; ++j) {
		const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) j * 2);
		const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * ((int) j * 2 + 1));
		uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) j * 2;
		uint8_t *yuv_y2 = out_data[0] + out_stride[0] * ((int) j * 2 + 1);
		uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
		uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
		for(unsigned int i = 0; i < w / 2; ++i) {
			uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
			rgb1 += 2; rgb2 += 2;
			int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
			int g[4] = {(int) ((c[0] >> 8) & 0xff), (int) ((c[1] >> 8) & 0xff), (int) ((c[2] >> 8) & 0xff), (int) ((c[3] >> 8) & 0xff)};
			int b[4] = {(int) ((c[0] ) & 0xff), (int) ((c[1] ) & 0xff), (int) ((c[2] ) & 0xff), (int) ((c[3] ) & 0xff)};
			yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
			yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
			yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
			yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
			yuv_y1 += 2; yuv_y2 += 2;
			int sr = r[0] + r[1] + r[2] + r[3];
			int sg = g[0] + g[1] + g[2] + g[3];
			int sb = b[0] + b[1] + b[2] + b[3];
			*yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
			*yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
			++yuv_u; ++yuv_v;
		}
	}

}
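
The constants in this function are the common fixed-point BT.601 limited-range conversion: the Y coefficients (66, 129, 25) are scaled by 256 with the +16 black level folded into offset_y, and the U/V rows operate on the sum of a 2x2 block, so the usual >> 8 becomes >> 10 and offset_uv is pre-multiplied by 4. A tiny self-contained check of the arithmetic (illustration only, not part of the commit): white should map to Y = 235 with neutral chroma U = V = 128, and black to Y = 16.

#include <cstdio>

int main() {
	// same constants as Convert_BGRA_YUV420_Fallback above
	const int offset_y = 128 + (16 << 8);            // rounding term + black level, pre-shifted
	const int offset_uv = (128 + (128 << 8)) << 2;   // rounding term + chroma offset, scaled for 4-pixel sums
	int r = 255, g = 255, b = 255;                   // one white pixel
	int y = (66 * r + 129 * g + 25 * b + offset_y) >> 8;
	int sr = 4 * r, sg = 4 * g, sb = 4 * b;          // a 2x2 block of the same pixel
	int u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
	int v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
	printf("Y=%d U=%d V=%d\n", y, u, v);             // prints Y=235 U=128 V=128
	return 0;
}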

src/AV/FastScaler_Convert.cpp renamed to src/AV/FastScaler_Convert_SSSE3.cpp (-60)

@@ -22,71 +22,11 @@ along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
 
 #if SSR_USE_X86_ASM
 
-#ifndef __MMX__
-#define __MMX__
-#endif
-#ifndef __SSE__
-#define __SSE__
-#endif
-#ifndef __SSE2__
-#define __SSE2__
-#endif
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #include <pmmintrin.h>
 #include <tmmintrin.h>
 
-#endif
-
-/*
-==== Fallback BGRA-to-YUV420 Converter ====
-
-Nothing special, just plain C code. It processes blocks of 2x2 pixels of the input image and produces 2x2 Y, 1x1 U and 1x1 V values.
-*/
-
-void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
-	Q_ASSERT(w % 2 == 0 && h % 2 == 0);
-
-	const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;
-
-	for(unsigned int j = 0; j < h / 2; ++j) {
-		const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) j * 2);
-		const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * ((int) j * 2 + 1));
-		uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) j * 2;
-		uint8_t *yuv_y2 = out_data[0] + out_stride[0] * ((int) j * 2 + 1);
-		uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
-		uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
-		for(unsigned int i = 0; i < w / 2; ++i) {
-			uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
-			rgb1 += 2; rgb2 += 2;
-			int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
-			int g[4] = {(int) ((c[0] >> 8) & 0xff), (int) ((c[1] >> 8) & 0xff), (int) ((c[2] >> 8) & 0xff), (int) ((c[3] >> 8) & 0xff)};
-			int b[4] = {(int) ((c[0] ) & 0xff), (int) ((c[1] ) & 0xff), (int) ((c[2] ) & 0xff), (int) ((c[3] ) & 0xff)};
-			yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
-			yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
-			yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
-			yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
-			yuv_y1 += 2; yuv_y2 += 2;
-			int sr = r[0] + r[1] + r[2] + r[3];
-			int sg = g[0] + g[1] + g[2] + g[3];
-			int sb = b[0] + b[1] + b[2] + b[3];
-			*yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
-			*yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
-			++yuv_u; ++yuv_v;
-		}
-	}
-
-}
-
-#if SSR_USE_X86_ASM
-
 /*
 ==== SSSE3 BGRA-to-YUV420 Converter ====
 
src/AV/FastScaler_Scale_Fallback.cpp (new file, +192 lines)

/*
Copyright (c) 2012-2013 Maarten Baert <[email protected]>

This file is part of SimpleScreenRecorder.

SimpleScreenRecorder is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

SimpleScreenRecorder is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
*/

#include "Global.h"
#include "FastScaler_Scale.h"

#include "FastScaler_Scale_Generic.h"
#include "TempBuffer.h"

/*
==== Fallback MipMapper ====

Uses 'wannabe-SIMD': 4x 16-bit values in normal 64-bit registers. This works as long as overflow is avoided.
Performs best on 64-bit systems, but even on 32-bit it should still be reasonably good.

It's important that this function is force-inlined because this allows the compiler to eliminate the inner loops for common mipmap factors.
*/

inline __attribute__((always_inline))
void MipMap_BGRA_Fallback_Dynamic(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
	const uint64_t mask = vec4x16(0xff);
	const uint64_t offset = vec4x16(1u << (mx + my - 1));
	unsigned int wrem = in_w & ((1u << mx) - 1);
	unsigned int hrem = in_h & ((1u << my) - 1);
	for(unsigned int out_j = 0; out_j < (in_h >> my); ++out_j) {
		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < (1u << my); ++mj) {
				for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			in += (1u << mx);
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
		if(wrem != 0) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < (1u << my); ++mj) {
				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				uint64_t c = in2[wrem - 1];
				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
	}
	if(hrem != 0) {
		unsigned int out_j = in_h >> my;
		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
				for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
				uint64_t c = in2[mi];
				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
			}
			in += (1u << mx);
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
		if(wrem != 0) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				uint64_t c = in2[wrem - 1];
				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
				uint64_t c = in2[mi];
				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
			}
			uint64_t c = in2[wrem - 1];
			sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1)) * ((1u << mx) - (wrem - 1));
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
	}
}

void MipMap_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
	Q_ASSERT(mx + my <= 8);
	switch((mx << 8) | my) {
		case 0x0000: Q_ASSERT(false); break;
		case 0x0001: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 1); break;
		case 0x0002: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 2); break;
		case 0x0100: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 0); break;
		case 0x0101: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 1); break;
		case 0x0102: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 2); break;
		case 0x0103: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 3); break;
		case 0x0200: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 0); break;
		case 0x0201: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 1); break;
		case 0x0202: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 2); break;
		case 0x0203: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 3); break;
		case 0x0301: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 1); break;
		case 0x0302: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 2); break;
		case 0x0303: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 3); break;
		default: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, mx, my); break;
	}
}

/*
==== Fallback Bilinear Scaler ====

Uses 'wannabe-SIMD' like the mipmapper. It's slightly less efficient here because of the multiplications, but still much faster than plain 32-bit integers.
*/

void Bilinear_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride,
		unsigned int mx, unsigned int my) {
	Q_ASSERT(in_w > 1 && in_h > 1); //TODO// support size 1?
	Q_ASSERT(out_w > 1 && out_h > 1); //TODO// support size 1?
	Q_ASSERT(in_w < (1 << 28) && in_h < (1 << 28));
	Q_ASSERT(out_w < (1 << 28) && out_h < (1 << 28));

	// precompute horizontal offsets and fractions
	TempBuffer<unsigned int> x_offset_table, x_fraction_table;
	x_offset_table.alloc(out_w);
	x_fraction_table.alloc(out_w);
	for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
		Bilinear_MapIndex(out_i, in_w, out_w, mx, x_offset_table[out_i], x_fraction_table[out_i]);
	}

	const uint64_t mask = vec4x16(0xff);
	const uint64_t offset = vec4x16(128);

	// scale
	for(unsigned int out_j = 0; out_j < out_h; ++out_j) {
		unsigned int y_offset, y_fraction;
		Bilinear_MapIndex(out_j, in_h, out_h, my, y_offset, y_fraction);
		unsigned int y_fraction_inv = 256 - y_fraction;
		unsigned int *x_offset_ptr = x_offset_table.data(), *x_fraction_ptr = x_fraction_table.data();
		const uint32_t *in1 = (const uint32_t*) (in_data + in_stride * (int) y_offset);
		const uint32_t *in2 = (const uint32_t*) (in_data + in_stride * ((int) y_offset + 1));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
			unsigned int x_offset = *(x_offset_ptr++), x_fraction = *(x_fraction_ptr++), x_fraction_inv = 256 - x_fraction;
			uint64_t c[4] = {in1[x_offset], in1[x_offset + 1], in2[x_offset], in2[x_offset + 1]};
			uint64_t p[4] = {((c[0] << 24) | c[0]) & mask, ((c[1] << 24) | c[1]) & mask, ((c[2] << 24) | c[2]) & mask, ((c[3] << 24) | c[3]) & mask};
			uint64_t q[2] = {((p[0] * x_fraction_inv + p[1] * x_fraction + offset) >> 8) & mask, ((p[2] * x_fraction_inv + p[3] * x_fraction + offset) >> 8) & mask};
			uint64_t r = ((q[0] * y_fraction_inv + q[1] * y_fraction + offset) >> 8) & mask;
			*(out++) = ((uint32_t) (r >> 24)) | ((uint32_t) r);
		}
	}

}

void Scale_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride) {
	Scale_BGRA_Generic(in_w, in_h, in_data, in_stride, out_w, out_h, out_data, out_stride, MipMap_BGRA_Fallback, Bilinear_BGRA_Fallback);
}
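
Both functions in this file rely on the same 'wannabe-SIMD' trick mentioned in the comments: ((c << 24) | c) & mask spreads the four 8-bit channels of one BGRA pixel into four 16-bit lanes of a uint64_t (in B, R, G, A lane order), the per-lane sums and 8-bit-fraction blends stay below 16 bits (hence the Q_ASSERT(mx + my <= 8)), so no carries cross lane boundaries, and ((uint32_t) (q >> 24)) | ((uint32_t) q) packs the lanes back into a pixel. A small standalone sketch (not part of the commit) that averages two pixels the way the mipmapper does and then blends them with an 8-bit fraction the way the bilinear scaler does:

// Standalone illustration of the packing trick; the vec4x16() helper and the
// lane arithmetic mirror the code above, everything else is just for the demo.
#include <cstdint>
#include <cstdio>

static inline uint64_t vec4x16(uint64_t x) {
	return x | (x << 16) | (x << 32) | (x << 48);
}

int main() {
	const uint64_t mask = vec4x16(0xff);
	uint32_t p1 = 0x80402010, p2 = 0x90503020;              // two BGRA pixels, written as 0xAARRGGBB integers
	uint64_t c1 = p1, c2 = p2;
	uint64_t s1 = ((c1 << 24) | c1) & mask;                  // lanes: B, R, G, A (16 bits each)
	uint64_t s2 = ((c2 << 24) | c2) & mask;

	// mipmap-style rounded average of the two pixels, all four channels at once
	uint64_t q = ((s1 + s2 + vec4x16(1)) >> 1) & mask;
	uint32_t avg = ((uint32_t) (q >> 24)) | ((uint32_t) q);  // pack back to BGRA
	printf("average: 0x%08X\n", avg);                        // prints 0x88482818

	// bilinear-style blend with an 8-bit fraction (f = 64 means 1/4 of p2)
	unsigned int f = 64, f_inv = 256 - f;
	uint64_t b = ((s1 * f_inv + s2 * f + vec4x16(128)) >> 8) & mask;
	uint32_t mix = ((uint32_t) (b >> 24)) | ((uint32_t) b);
	printf("blend:   0x%08X\n", mix);                        // prints 0x84442414
	return 0;
}

The force-inlining of MipMap_BGRA_Fallback_Dynamic together with the switch in MipMap_BGRA_Fallback serves a separate purpose: it lets the compiler specialize and unroll the tiny inner loops for the common mipmap factors, as the comment in the file notes.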
