Commit a263b55

Move SSSE3 code to a separate library; this should hopefully fix some of the compilation issues in older GCC versions (32-bit).

Parent: d188481

15 files changed: +576, -421 lines

bootstrap (+4, -1)

@@ -1,3 +1,6 @@
-#!/bin/sh
+#!/bin/bash
+
+set -e
+cd "$( dirname "${BASH_SOURCE[0]}" )"
 
 autoreconf --install --force

postinstall (+2, -1)

@@ -1,7 +1,8 @@
-#!/bin/sh
+#!/bin/bash
 # You should run this (as root) after installation/uninstallation to make sure the libraries, desktop entry and icon will be found.
 
 set -e
+cd "$( dirname "${BASH_SOURCE[0]}" )"
 
 if [ x"$( whoami )" != x"root" ]; then
 	echo "Error: postinstall should be run as root"

simple-uninstall (+1, -1)

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # Run this if you used 'simple-build-and-install' and you want to uninstall the program again.
 
 set -e

src/AV/FastScaler_Convert.h (+1, -1)

@@ -23,5 +23,5 @@ along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
 void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
 
 #if SSR_USE_X86_ASM
-void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) __attribute__((__target__("sse,sse2,sse3,ssse3")));
+void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
 #endif
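
This declaration change is the crux of the commit: the per-function __attribute__((__target__("sse,sse2,sse3,ssse3"))) was how SSSE3 code generation was enabled inside a translation unit otherwise built without those flags, and it is presumably what broke on older 32-bit GCC versions. With the attribute gone, the SSSE3 implementation moves into its own file (FastScaler_Convert_SSSE3.cpp below) that can be built as a separate library with the SSSE3 compiler flags, while callers keep an ordinary declaration and choose an implementation at run time. A minimal sketch of that call-site pattern, not taken from this diff; the Convert_BGRA_YUV420() dispatcher name and the __builtin_cpu_supports() check (a GCC 4.8+ builtin) are illustrative assumptions:

// Sketch only: hypothetical dispatcher, not part of the commit.
#include <stdint.h>

void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
#if SSR_USE_X86_ASM
// defined in FastScaler_Convert_SSSE3.cpp, which would be compiled with -mssse3
void Convert_BGRA_YUV420_SSSE3(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]);
#endif

void Convert_BGRA_YUV420(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
#if SSR_USE_X86_ASM
	if(__builtin_cpu_supports("ssse3")) {
		Convert_BGRA_YUV420_SSSE3(w, h, in_data, in_stride, out_data, out_stride);
		return;
	}
#endif
	Convert_BGRA_YUV420_Fallback(w, h, in_data, in_stride, out_data, out_stride);
}

The build-system wiring for the new library is presumably in the remaining changed files, which are not shown in this excerpt.
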
New file (+61 lines): the fallback BGRA-to-YUV420 converter, split out of src/AV/FastScaler_Convert.cpp

/*
Copyright (c) 2012-2013 Maarten Baert <[email protected]>

This file is part of SimpleScreenRecorder.

SimpleScreenRecorder is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

SimpleScreenRecorder is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
*/

#include "Global.h"
#include "FastScaler_Convert.h"

/*
==== Fallback BGRA-to-YUV420 Converter ====

Nothing special, just plain C code. It processes blocks of 2x2 pixels of the input image and produces 2x2 Y, 1x1 U and 1x1 V values.
*/

void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
	Q_ASSERT(w % 2 == 0 && h % 2 == 0);

	const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;

	for(unsigned int j = 0; j < h / 2; ++j) {
		const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) j * 2);
		const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * ((int) j * 2 + 1));
		uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) j * 2;
		uint8_t *yuv_y2 = out_data[0] + out_stride[0] * ((int) j * 2 + 1);
		uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
		uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
		for(unsigned int i = 0; i < w / 2; ++i) {
			uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
			rgb1 += 2; rgb2 += 2;
			int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
			int g[4] = {(int) ((c[0] >> 8) & 0xff), (int) ((c[1] >> 8) & 0xff), (int) ((c[2] >> 8) & 0xff), (int) ((c[3] >> 8) & 0xff)};
			int b[4] = {(int) ((c[0] ) & 0xff), (int) ((c[1] ) & 0xff), (int) ((c[2] ) & 0xff), (int) ((c[3] ) & 0xff)};
			yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
			yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
			yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
			yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
			yuv_y1 += 2; yuv_y2 += 2;
			int sr = r[0] + r[1] + r[2] + r[3];
			int sg = g[0] + g[1] + g[2] + g[3];
			int sb = b[0] + b[1] + b[2] + b[3];
			*yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
			*yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
			++yuv_u; ++yuv_v;
		}
	}

}
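
The constants in this function are the common fixed-point BT.601 limited-range conversion: the Y coefficients (66, 129, 25) are scaled by 256 with the +16 black level folded into offset_y, and the U/V rows operate on the sum of a 2x2 block, so the usual >> 8 becomes >> 10 and offset_uv is pre-multiplied by 4. A tiny self-contained check of the arithmetic (illustration only, not part of the commit): white should map to Y = 235 with neutral chroma U = V = 128, and black to Y = 16.

#include <cstdio>

int main() {
	// same constants as Convert_BGRA_YUV420_Fallback above
	const int offset_y = 128 + (16 << 8);            // rounding term + black level, pre-shifted
	const int offset_uv = (128 + (128 << 8)) << 2;   // rounding term + chroma offset, scaled for 4-pixel sums
	int r = 255, g = 255, b = 255;                   // one white pixel
	int y = (66 * r + 129 * g + 25 * b + offset_y) >> 8;
	int sr = 4 * r, sg = 4 * g, sb = 4 * b;          // a 2x2 block of the same pixel
	int u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
	int v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
	printf("Y=%d U=%d V=%d\n", y, u, v);             // prints Y=235 U=128 V=128
	return 0;
}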

src/AV/FastScaler_Convert.cpp renamed to src/AV/FastScaler_Convert_SSSE3.cpp (-60)

@@ -22,71 +22,11 @@ along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
 
 #if SSR_USE_X86_ASM
 
-#ifndef __MMX__
-#define __MMX__
-#endif
-#ifndef __SSE__
-#define __SSE__
-#endif
-#ifndef __SSE2__
-#define __SSE2__
-#endif
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #include <pmmintrin.h>
 #include <tmmintrin.h>
 
-#endif
-
-/*
-==== Fallback BGRA-to-YUV420 Converter ====
-
-Nothing special, just plain C code. It processes blocks of 2x2 pixels of the input image and produces 2x2 Y, 1x1 U and 1x1 V values.
-*/
-
-void Convert_BGRA_YUV420_Fallback(unsigned int w, unsigned int h, const uint8_t* in_data, int in_stride, uint8_t* const out_data[3], const int out_stride[3]) {
-	Q_ASSERT(w % 2 == 0 && h % 2 == 0);
-
-	const int offset_y = 128 + (16 << 8), offset_uv = (128 + (128 << 8)) << 2;
-
-	for(unsigned int j = 0; j < h / 2; ++j) {
-		const uint32_t *rgb1 = (const uint32_t*) (in_data + in_stride * (int) j * 2);
-		const uint32_t *rgb2 = (const uint32_t*) (in_data + in_stride * ((int) j * 2 + 1));
-		uint8_t *yuv_y1 = out_data[0] + out_stride[0] * (int) j * 2;
-		uint8_t *yuv_y2 = out_data[0] + out_stride[0] * ((int) j * 2 + 1);
-		uint8_t *yuv_u = out_data[1] + out_stride[1] * (int) j;
-		uint8_t *yuv_v = out_data[2] + out_stride[2] * (int) j;
-		for(unsigned int i = 0; i < w / 2; ++i) {
-			uint32_t c[4] = {rgb1[0], rgb1[1], rgb2[0], rgb2[1]};
-			rgb1 += 2; rgb2 += 2;
-			int r[4] = {(int) ((c[0] >> 16) & 0xff), (int) ((c[1] >> 16) & 0xff), (int) ((c[2] >> 16) & 0xff), (int) ((c[3] >> 16) & 0xff)};
-			int g[4] = {(int) ((c[0] >> 8) & 0xff), (int) ((c[1] >> 8) & 0xff), (int) ((c[2] >> 8) & 0xff), (int) ((c[3] >> 8) & 0xff)};
-			int b[4] = {(int) ((c[0] ) & 0xff), (int) ((c[1] ) & 0xff), (int) ((c[2] ) & 0xff), (int) ((c[3] ) & 0xff)};
-			yuv_y1[0] = (66 * r[0] + 129 * g[0] + 25 * b[0] + offset_y) >> 8;
-			yuv_y1[1] = (66 * r[1] + 129 * g[1] + 25 * b[1] + offset_y) >> 8;
-			yuv_y2[0] = (66 * r[2] + 129 * g[2] + 25 * b[2] + offset_y) >> 8;
-			yuv_y2[1] = (66 * r[3] + 129 * g[3] + 25 * b[3] + offset_y) >> 8;
-			yuv_y1 += 2; yuv_y2 += 2;
-			int sr = r[0] + r[1] + r[2] + r[3];
-			int sg = g[0] + g[1] + g[2] + g[3];
-			int sb = b[0] + b[1] + b[2] + b[3];
-			*yuv_u = (-38 * sr + -74 * sg + 112 * sb + offset_uv) >> 10;
-			*yuv_v = (112 * sr + -94 * sg + -18 * sb + offset_uv) >> 10;
-			++yuv_u; ++yuv_v;
-		}
-	}
-
-}
-
-#if SSR_USE_X86_ASM
-
 /*
 ==== SSSE3 BGRA-to-YUV420 Converter ====
 
src/AV/FastScaler_Scale_Fallback.cpp (new file, +192 lines)

/*
Copyright (c) 2012-2013 Maarten Baert <[email protected]>

This file is part of SimpleScreenRecorder.

SimpleScreenRecorder is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

SimpleScreenRecorder is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with SimpleScreenRecorder. If not, see <http://www.gnu.org/licenses/>.
*/

#include "Global.h"
#include "FastScaler_Scale.h"

#include "FastScaler_Scale_Generic.h"
#include "TempBuffer.h"

/*
==== Fallback MipMapper ====

Uses 'wannabe-SIMD': 4x 16-bit values in normal 64-bit registers. This works as long as overflow is avoided.
Performs best on 64-bit systems, but even on 32-bit it should still be reasonably good.

It's important that this function is force-inlined because this allows the compiler to eliminate the inner loops for common mipmap factors.
*/

inline __attribute__((always_inline))
void MipMap_BGRA_Fallback_Dynamic(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
	const uint64_t mask = vec4x16(0xff);
	const uint64_t offset = vec4x16(1u << (mx + my - 1));
	unsigned int wrem = in_w & ((1u << mx) - 1);
	unsigned int hrem = in_h & ((1u << my) - 1);
	for(unsigned int out_j = 0; out_j < (in_h >> my); ++out_j) {
		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < (1u << my); ++mj) {
				for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			in += (1u << mx);
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
		if(wrem != 0) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < (1u << my); ++mj) {
				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				uint64_t c = in2[wrem - 1];
				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
	}
	if(hrem != 0) {
		unsigned int out_j = in_h >> my;
		const uint32_t *in = (const uint32_t*) (in_data + in_stride * (int) (out_j << my));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < (in_w >> mx); ++out_i) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
				for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			for(unsigned int mi = 0; mi < (1u << mx); ++mi) {
				uint64_t c = in2[mi];
				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
			}
			in += (1u << mx);
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*(out++) = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
		if(wrem != 0) {
			uint64_t sum = 0;
			const uint32_t *in2 = in;
			for(unsigned int mj = 0; mj < hrem - 1; ++mj) {
				for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
					uint64_t c = in2[mi];
					sum += ((c << 24) | c) & mask;
				}
				uint64_t c = in2[wrem - 1];
				sum += (((c << 24) | c) & mask) * ((1u << mx) - (wrem - 1));
				in2 = (const uint32_t*) ((const uint8_t*) in2 + in_stride);
			}
			for(unsigned int mi = 0; mi < wrem - 1; ++mi) {
				uint64_t c = in2[mi];
				sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1));
			}
			uint64_t c = in2[wrem - 1];
			sum += (((c << 24) | c) & mask) * ((1u << my) - (hrem - 1)) * ((1u << mx) - (wrem - 1));
			uint64_t q = ((sum + offset) >> (mx + my)) & mask;
			*out = ((uint32_t) (q >> 24)) | ((uint32_t) q);
		}
	}
}

void MipMap_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		uint8_t* out_data, int out_stride, unsigned int mx, unsigned int my) {
	Q_ASSERT(mx + my <= 8);
	switch((mx << 8) | my) {
		case 0x0000: Q_ASSERT(false); break;
		case 0x0001: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 1); break;
		case 0x0002: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 0, 2); break;
		case 0x0100: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 0); break;
		case 0x0101: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 1); break;
		case 0x0102: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 2); break;
		case 0x0103: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 1, 3); break;
		case 0x0200: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 0); break;
		case 0x0201: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 1); break;
		case 0x0202: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 2); break;
		case 0x0203: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 2, 3); break;
		case 0x0301: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 1); break;
		case 0x0302: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 2); break;
		case 0x0303: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, 3, 3); break;
		default: MipMap_BGRA_Fallback_Dynamic(in_w, in_h, in_data, in_stride, out_data, out_stride, mx, my); break;
	}
}

/*
==== Fallback Bilinear Scaler ====

Uses 'wannabe-SIMD' like the mipmapper. It's slightly less efficient here because of the multiplications, but still much faster than plain 32-bit integers.
*/

void Bilinear_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride,
		unsigned int mx, unsigned int my) {
	Q_ASSERT(in_w > 1 && in_h > 1); //TODO// support size 1?
	Q_ASSERT(out_w > 1 && out_h > 1); //TODO// support size 1?
	Q_ASSERT(in_w < (1 << 28) && in_h < (1 << 28));
	Q_ASSERT(out_w < (1 << 28) && out_h < (1 << 28));

	// precompute horizontal offsets and fractions
	TempBuffer<unsigned int> x_offset_table, x_fraction_table;
	x_offset_table.alloc(out_w);
	x_fraction_table.alloc(out_w);
	for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
		Bilinear_MapIndex(out_i, in_w, out_w, mx, x_offset_table[out_i], x_fraction_table[out_i]);
	}

	const uint64_t mask = vec4x16(0xff);
	const uint64_t offset = vec4x16(128);

	// scale
	for(unsigned int out_j = 0; out_j < out_h; ++out_j) {
		unsigned int y_offset, y_fraction;
		Bilinear_MapIndex(out_j, in_h, out_h, my, y_offset, y_fraction);
		unsigned int y_fraction_inv = 256 - y_fraction;
		unsigned int *x_offset_ptr = x_offset_table.data(), *x_fraction_ptr = x_fraction_table.data();
		const uint32_t *in1 = (const uint32_t*) (in_data + in_stride * (int) y_offset);
		const uint32_t *in2 = (const uint32_t*) (in_data + in_stride * ((int) y_offset + 1));
		uint32_t *out = (uint32_t*) (out_data + out_stride * (int) out_j);
		for(unsigned int out_i = 0; out_i < out_w; ++out_i) {
			unsigned int x_offset = *(x_offset_ptr++), x_fraction = *(x_fraction_ptr++), x_fraction_inv = 256 - x_fraction;
			uint64_t c[4] = {in1[x_offset], in1[x_offset + 1], in2[x_offset], in2[x_offset + 1]};
			uint64_t p[4] = {((c[0] << 24) | c[0]) & mask, ((c[1] << 24) | c[1]) & mask, ((c[2] << 24) | c[2]) & mask, ((c[3] << 24) | c[3]) & mask};
			uint64_t q[2] = {((p[0] * x_fraction_inv + p[1] * x_fraction + offset) >> 8) & mask, ((p[2] * x_fraction_inv + p[3] * x_fraction + offset) >> 8) & mask};
			uint64_t r = ((q[0] * y_fraction_inv + q[1] * y_fraction + offset) >> 8) & mask;
			*(out++) = ((uint32_t) (r >> 24)) | ((uint32_t) r);
		}
	}

}

void Scale_BGRA_Fallback(unsigned int in_w, unsigned int in_h, const uint8_t* in_data, int in_stride,
		unsigned int out_w, unsigned int out_h, uint8_t* out_data, int out_stride) {
	Scale_BGRA_Generic(in_w, in_h, in_data, in_stride, out_w, out_h, out_data, out_stride, MipMap_BGRA_Fallback, Bilinear_BGRA_Fallback);
}
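
Both functions in this file rely on the same 'wannabe-SIMD' trick mentioned in the comments: ((c << 24) | c) & mask spreads the four 8-bit channels of one BGRA pixel into four 16-bit lanes of a uint64_t (in B, R, G, A lane order), the per-lane sums and 8-bit-fraction blends stay below 16 bits (hence the Q_ASSERT(mx + my <= 8)), so no carries cross lane boundaries, and ((uint32_t) (q >> 24)) | ((uint32_t) q) packs the lanes back into a pixel. A small standalone sketch (not part of the commit) that averages two pixels the way the mipmapper does and then blends them with an 8-bit fraction the way the bilinear scaler does:

// Standalone illustration of the packing trick; the vec4x16() helper and the
// lane arithmetic mirror the code above, everything else is just for the demo.
#include <cstdint>
#include <cstdio>

static inline uint64_t vec4x16(uint64_t x) {
	return x | (x << 16) | (x << 32) | (x << 48);
}

int main() {
	const uint64_t mask = vec4x16(0xff);
	uint32_t p1 = 0x80402010, p2 = 0x90503020;              // two BGRA pixels, written as 0xAARRGGBB integers
	uint64_t c1 = p1, c2 = p2;
	uint64_t s1 = ((c1 << 24) | c1) & mask;                  // lanes: B, R, G, A (16 bits each)
	uint64_t s2 = ((c2 << 24) | c2) & mask;

	// mipmap-style rounded average of the two pixels, all four channels at once
	uint64_t q = ((s1 + s2 + vec4x16(1)) >> 1) & mask;
	uint32_t avg = ((uint32_t) (q >> 24)) | ((uint32_t) q);  // pack back to BGRA
	printf("average: 0x%08X\n", avg);                        // prints 0x88482818

	// bilinear-style blend with an 8-bit fraction (f = 64 means 1/4 of p2)
	unsigned int f = 64, f_inv = 256 - f;
	uint64_t b = ((s1 * f_inv + s2 * f + vec4x16(128)) >> 8) & mask;
	uint32_t mix = ((uint32_t) (b >> 24)) | ((uint32_t) b);
	printf("blend:   0x%08X\n", mix);                        // prints 0x84442414
	return 0;
}

The force-inlining of MipMap_BGRA_Fallback_Dynamic together with the switch in MipMap_BGRA_Fallback serves a separate purpose: it lets the compiler specialize and unroll the tiny inner loops for the common mipmap factors, as the comment in the file notes.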
