summaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-multimedia
diff options
context:
space:
mode:
authorAndreas Müller <schnitzeltony@googlemail.com>2017-02-28 07:57:01 +0100
committerMartin Jansa <Martin.Jansa@gmail.com>2017-04-18 14:21:42 +0200
commit8b0da4671f718abc8e263395bbb72c4dc32938ee (patch)
treeda930d4cbd5dabea61809548c6fd36127e9ce2a4 /meta-oe/recipes-multimedia
parent2c78fa9134074c8bdb3e34c7ad41e2832354bb8c (diff)
downloadmeta-openembedded-8b0da4671f718abc8e263395bbb72c4dc32938ee.tar.gz
jack: update to latest git-revision
NEON patches are upstream Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
Diffstat (limited to 'meta-oe/recipes-multimedia')
-rw-r--r--meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch496
-rw-r--r--meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch433
-rw-r--r--meta-oe/recipes-multimedia/jack/jack_git.bb8
3 files changed, 2 insertions, 935 deletions
diff --git a/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch b/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch
deleted file mode 100644
index 76ec7136b..000000000
--- a/meta-oe/recipes-multimedia/jack/jack/0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch
+++ /dev/null
@@ -1,496 +0,0 @@
1From 99785aabc685a94415fcd445345c093488e10350 Mon Sep 17 00:00:00 2001
2From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com>
3Date: Fri, 13 Jan 2017 22:42:11 +0100
4Subject: [PATCH 1/2] Add ARM-NEON acceleration for all non-dithering sample
5 conversion functions
6MIME-Version: 1.0
7Content-Type: text/plain; charset=UTF-8
8Content-Transfer-Encoding: 8bit
9
10Upstream-Status: Submitted [1]
11
12[1] https://github.com/jackaudio/jack2/pull/250
13
14Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
15---
16 common/memops.c | 356 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
17 1 file changed, 351 insertions(+), 5 deletions(-)
18
19diff --git a/common/memops.c b/common/memops.c
20index 2ff0792..8f9ece2 100644
21--- a/common/memops.c
22+++ b/common/memops.c
23@@ -42,6 +42,10 @@
24 #endif
25 #endif
26
27+#ifdef __ARM_NEON__
28+#include <arm_neon.h>
29+#endif
30+
31 /* Notes about these *_SCALING values.
32
33 the MAX_<N>BIT values are floating point. when multiplied by
34@@ -193,6 +197,35 @@ static inline __m128i float_24_sse(__m128 s)
35 }
36 #endif
37
38+
39+#ifdef __ARM_NEON__
40+
41+static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
42+{
43+ return vminq_f32(max, vmaxq_f32(s, min));
44+}
45+
46+static inline int32x4_t float_24_neon(float32x4_t s)
47+{
48+ const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
49+ const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
50+
51+ float32x4_t clipped = clip(s, lower_bound, upper_bound);
52+ float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
53+ return vcvtq_s32_f32(scaled);
54+}
55+
56+static inline int16x4_t float_16_neon(float32x4_t s)
57+{
58+ const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
59+ const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
60+
61+ float32x4_t clipped = clip(s, lower_bound, upper_bound);
62+ float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
63+ return vmovn_s32(vcvtq_s32_f32(scaled));
64+}
65+#endif
66+
67 /* Linear Congruential noise generator. From the music-dsp list
68 * less random than rand(), but good enough and 10x faster
69 */
70@@ -248,6 +281,32 @@ void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsign
71
72 void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
73 {
74+#ifdef __ARM_NEON__
75+ unsigned long unrolled = nsamples / 4;
76+ nsamples = nsamples & 3;
77+
78+ while (unrolled--) {
79+ float32x4_t samples = vld1q_f32(src);
80+ int32x4_t converted = float_24_neon(samples);
81+ int32x4_t shifted = vshlq_n_s32(converted, 8);
82+ shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted)));
83+
84+ switch(dst_skip) {
85+ case 4:
86+ vst1q_s32((int32_t*)dst, shifted);
87+ break;
88+ default:
89+ vst1q_lane_s32((int32_t*)(dst), shifted, 0);
90+ vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
91+ vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
92+ vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
93+ break;
94+ }
95+ dst += 4*dst_skip;
96+ src+= 4;
97+ }
98+#endif
99+
100 int32_t z;
101
102 while (nsamples--) {
103@@ -321,7 +380,33 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne
104 src++;
105 }
106
107-#else
108+#elif defined(__ARM_NEON__)
109+ unsigned long unrolled = nsamples / 4;
110+ nsamples = nsamples & 3;
111+
112+ while (unrolled--) {
113+ float32x4_t samples = vld1q_f32(src);
114+ int32x4_t converted = float_24_neon(samples);
115+ int32x4_t shifted = vshlq_n_s32(converted, 8);
116+
117+ switch(dst_skip) {
118+ case 4:
119+ vst1q_s32((int32_t*)dst, shifted);
120+ break;
121+ default:
122+ vst1q_lane_s32((int32_t*)(dst), shifted, 0);
123+ vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
124+ vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
125+ vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
126+ break;
127+ }
128+ dst += 4*dst_skip;
129+
130+ src+= 4;
131+ }
132+#endif
133+
134+#if !defined (__SSE2__)
135 while (nsamples--) {
136 float_24u32 (*src, *((int32_t*) dst));
137 dst += dst_skip;
138@@ -332,6 +417,38 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne
139
140 void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
141 {
142+#ifdef __ARM_NEON__
143+ float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
144+ unsigned long unrolled = nsamples / 4;
145+ while (unrolled--) {
146+ int32x4_t src128;
147+ switch(src_skip)
148+ {
149+ case 4:
150+ src128 = vld1q_s32((int32_t*)src);
151+ break;
152+ case 8:
153+ src128 = vld2q_s32((int32_t*)src).val[0];
154+ break;
155+ default:
156+ src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
157+ src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
158+ src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
159+ src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
160+ break;
161+ }
162+ src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
163+ int32x4_t shifted = vshrq_n_s32(src128, 8);
164+ float32x4_t as_float = vcvtq_f32_s32(shifted);
165+ float32x4_t divided = vmulq_f32(as_float, factor);
166+ vst1q_f32(dst, divided);
167+
168+ src += 4*src_skip;
169+ dst += 4;
170+ }
171+ nsamples = nsamples & 3;
172+#endif
173+
174 /* ALERT: signed sign-extension portability !!! */
175
176 const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
177@@ -389,6 +506,34 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne
178 dst += 4;
179 }
180 nsamples = nsamples & 3;
181+#elif defined(__ARM_NEON__)
182+ unsigned long unrolled = nsamples / 4;
183+ float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
184+ while (unrolled--) {
185+ int32x4_t src128;
186+ switch(src_skip) {
187+ case 4:
188+ src128 = vld1q_s32((int32_t*)src);
189+ break;
190+ case 8:
191+ src128 = vld2q_s32((int32_t*)src).val[0];
192+ break;
193+ default:
194+ src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
195+ src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
196+ src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
197+ src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
198+ break;
199+ }
200+ int32x4_t shifted = vshrq_n_s32(src128, 8);
201+ float32x4_t as_float = vcvtq_f32_s32(shifted);
202+ float32x4_t divided = vmulq_f32(as_float, factor);
203+ vst1q_f32(dst, divided);
204+
205+ src += 4*src_skip;
206+ dst += 4;
207+ }
208+ nsamples = nsamples & 3;
209 #endif
210
211 /* ALERT: signed sign-extension portability !!! */
212@@ -403,6 +548,24 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne
213
214 void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
215 {
216+#ifdef __ARM_NEON__
217+ unsigned long unrolled = nsamples / 4;
218+ while (unrolled--) {
219+ int32_t z[4];
220+ float32x4_t samples = vld1q_f32(src);
221+ int32x4_t converted = float_24_neon(samples);
222+ converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
223+ vst1q_s32(z, converted);
224+
225+ for (int i = 0; i != 4; ++i) {
226+ memcpy (dst, ((char*)(z+i))+1, 3);
227+ dst += dst_skip;
228+ }
229+ src += 4;
230+ }
231+ nsamples = nsamples & 3;
232+#endif
233+
234 int32_t z;
235
236 while (nsamples--) {
237@@ -426,7 +589,6 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
238 #if defined (__SSE2__) && !defined (__sun__)
239 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
240 while (nsamples >= 4) {
241- int i;
242 int32_t z[4];
243 __m128 samples = _mm_loadu_ps(src);
244 __m128i converted = float_24_sse(samples);
245@@ -447,7 +609,7 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
246 _mm_store_ss((float*)z+3, (__m128)shuffled3);
247 #endif
248
249- for (i = 0; i != 4; ++i) {
250+ for (int i = 0; i != 4; ++i) {
251 memcpy (dst, z+i, 3);
252 dst += dst_skip;
253 }
254@@ -455,6 +617,22 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
255 nsamples -= 4;
256 src += 4;
257 }
258+#elif defined(__ARM_NEON__)
259+ unsigned long unrolled = nsamples / 4;
260+ while (unrolled--) {
261+ int i;
262+ int32_t z[4];
263+ float32x4_t samples = vld1q_f32(src);
264+ int32x4_t converted = float_24_neon(samples);
265+ vst1q_s32(z, converted);
266+
267+ for (i = 0; i != 4; ++i) {
268+ memcpy (dst, z+i, 3);
269+ dst += dst_skip;
270+ }
271+ src += 4;
272+ }
273+ nsamples = nsamples & 3;
274 #endif
275
276 int32_t z;
277@@ -473,9 +651,41 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
278
279 void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
280 {
281+ const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
282+
283+#ifdef __ARM_NEON__
284+ // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
285+ const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
286+ int32_t x[4];
287+ memset(x, 0, sizeof(x));
288+ unsigned long unrolled = nsamples / 4;
289+ while (unrolled--) {
290+#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
291+ // right aligned / inverse sequence below -> *256
292+ memcpy(((char*)&x[0])+1, src, 3);
293+ memcpy(((char*)&x[1])+1, src+src_skip, 3);
294+ memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
295+ memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
296+#else
297+ memcpy(&x[0], src, 3);
298+ memcpy(&x[1], src+src_skip, 3);
299+ memcpy(&x[2], src+2*src_skip, 3);
300+ memcpy(&x[3], src+3*src_skip, 3);
301+#endif
302+ src += 4 * src_skip;
303+
304+ int32x4_t source = vld1q_s32(x);
305+ source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
306+ float32x4_t converted = vcvtq_f32_s32(source);
307+ float32x4_t scaled = vmulq_f32(converted, vscaling);
308+ vst1q_f32(dst, scaled);
309+ dst += 4;
310+ }
311+ nsamples = nsamples & 3;
312+#endif
313+
314 /* ALERT: signed sign-extension portability !!! */
315
316- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
317 while (nsamples--) {
318 int x;
319 #if __BYTE_ORDER == __LITTLE_ENDIAN
320@@ -528,6 +738,34 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l
321 dst += 4;
322 nsamples -= 4;
323 }
324+#elif defined(__ARM_NEON__)
325+ // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
326+ const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
327+ int32_t x[4];
328+ memset(x, 0, sizeof(x));
329+ unsigned long unrolled = nsamples / 4;
330+ while (unrolled--) {
331+#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
332+ // left aligned -> *256
333+ memcpy(&x[0], src, 3);
334+ memcpy(&x[1], src+src_skip, 3);
335+ memcpy(&x[2], src+2*src_skip, 3);
336+ memcpy(&x[3], src+3*src_skip, 3);
337+#else
338+ memcpy(((char*)&x[0])+1, src, 3);
339+ memcpy(((char*)&x[1])+1, src+src_skip, 3);
340+ memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
341+ memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
342+#endif
343+ src += 4 * src_skip;
344+
345+ int32x4_t source = vld1q_s32(x);
346+ float32x4_t converted = vcvtq_f32_s32(source);
347+ float32x4_t scaled = vmulq_f32(converted, vscaling);
348+ vst1q_f32(dst, scaled);
349+ dst += 4;
350+ }
351+ nsamples = nsamples & 3;
352 #endif
353
354 while (nsamples--) {
355@@ -547,6 +785,30 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l
356
357 void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
358 {
359+#ifdef __ARM_NEON__
360+ unsigned long unrolled = nsamples / 4;
361+ nsamples = nsamples & 3;
362+
363+ while (unrolled--) {
364+ float32x4_t samples = vld1q_f32(src);
365+ int16x4_t converted = float_16_neon(samples);
366+ converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted)));
367+
368+ switch(dst_skip) {
369+ case 2:
370+ vst1_s16((int16_t*)dst, converted);
371+ break;
372+ default:
373+ vst1_lane_s16((int16_t*)(dst), converted, 0);
374+ vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
375+ vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
376+ vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
377+ break;
378+ }
379+ dst += 4*dst_skip;
380+ src+= 4;
381+ }
382+#endif
383 int16_t tmp;
384
385 while (nsamples--) {
386@@ -574,6 +836,29 @@ void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned
387
388 void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
389 {
390+#ifdef __ARM_NEON__
391+ unsigned long unrolled = nsamples / 4;
392+ nsamples = nsamples & 3;
393+
394+ while (unrolled--) {
395+ float32x4_t samples = vld1q_f32(src);
396+ int16x4_t converted = float_16_neon(samples);
397+
398+ switch(dst_skip) {
399+ case 2:
400+ vst1_s16((int16_t*)dst, converted);
401+ break;
402+ default:
403+ vst1_lane_s16((int16_t*)(dst), converted, 0);
404+ vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
405+ vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
406+ vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
407+ break;
408+ }
409+ dst += 4*dst_skip;
410+ src+= 4;
411+ }
412+#endif
413 while (nsamples--) {
414 float_16 (*src, *((int16_t*) dst));
415 dst += dst_skip;
416@@ -728,8 +1013,39 @@ void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *
417
418 void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
419 {
420- short z;
421 const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
422+#ifdef __ARM_NEON__
423+ const float32x4_t vscaling = vdupq_n_f32(scaling);
424+ unsigned long unrolled = nsamples / 4;
425+ while (unrolled--) {
426+ int16x4_t source16x4;
427+ switch(src_skip) {
428+ case 2:
429+ source16x4 = vld1_s16((int16_t*)src);
430+ break;
431+ case 4:
432+ source16x4 = vld2_s16((int16_t*)src).val[0];
433+ break;
434+ default:
435+ source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
436+ source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
437+ source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
438+ source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
439+ break;
440+ }
441+ source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
442+ int32x4_t source32x4 = vmovl_s16(source16x4);
443+ src += 4 * src_skip;
444+
445+ float32x4_t converted = vcvtq_f32_s32(source32x4);
446+ float32x4_t scaled = vmulq_f32(converted, vscaling);
447+ vst1q_f32(dst, scaled);
448+ dst += 4;
449+ }
450+ nsamples = nsamples & 3;
451+#endif
452+
453+ short z;
454
455 /* ALERT: signed sign-extension portability !!! */
456 while (nsamples--) {
457@@ -752,6 +1068,36 @@ void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned l
458 {
459 /* ALERT: signed sign-extension portability !!! */
460 const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
461+#ifdef __ARM_NEON__
462+ const float32x4_t vscaling = vdupq_n_f32(scaling);
463+ unsigned long unrolled = nsamples / 4;
464+ while (unrolled--) {
465+ int16x4_t source16x4;
466+ switch(src_skip) {
467+ case 2:
468+ source16x4 = vld1_s16((int16_t*)src);
469+ break;
470+ case 4:
471+ source16x4 = vld2_s16((int16_t*)src).val[0];
472+ break;
473+ default:
474+ source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
475+ source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
476+ source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
477+ source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
478+ break;
479+ }
480+ int32x4_t source32x4 = vmovl_s16(source16x4);
481+ src += 4 * src_skip;
482+
483+ float32x4_t converted = vcvtq_f32_s32(source32x4);
484+ float32x4_t scaled = vmulq_f32(converted, vscaling);
485+ vst1q_f32(dst, scaled);
486+ dst += 4;
487+ }
488+ nsamples = nsamples & 3;
489+#endif
490+
491 while (nsamples--) {
492 *dst = (*((short *) src)) * scaling;
493 dst++;
494--
4952.5.5
496
diff --git a/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch b/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch
deleted file mode 100644
index e0c9e8ca8..000000000
--- a/meta-oe/recipes-multimedia/jack/jack/0002-jack_simdtests-add-application-checking-accurracy-an.patch
+++ /dev/null
@@ -1,433 +0,0 @@
1From d0543c0628d2c0a6d898c694003e941fa189b393 Mon Sep 17 00:00:00 2001
2From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@googlemail.com>
3Date: Sun, 15 Jan 2017 20:52:20 +0100
4Subject: [PATCH 2/2] jack_simdtests: add application checking accurracy and
5 performance of SIMD optimizations
6MIME-Version: 1.0
7Content-Type: text/plain; charset=UTF-8
8Content-Transfer-Encoding: 8bit
9
10Upstream-Status: Submitted [1]
11
12[1] https://github.com/jackaudio/jack2/pull/250
13
14Signed-off-by: Andreas Müller <schnitzeltony@googlemail.com>
15---
16 example-clients/simdtests.cpp | 390 ++++++++++++++++++++++++++++++++++++++++++
17 example-clients/wscript | 3 +-
18 2 files changed, 392 insertions(+), 1 deletion(-)
19 create mode 100644 example-clients/simdtests.cpp
20
21diff --git a/example-clients/simdtests.cpp b/example-clients/simdtests.cpp
22new file mode 100644
23index 0000000..b74d50a
24--- /dev/null
25+++ b/example-clients/simdtests.cpp
26@@ -0,0 +1,390 @@
27+/*
28+ * simdtests.c -- test accuraccy and performance of simd optimizations
29+ *
30+ * Copyright (C) 2017 Andreas Mueller.
31+ *
32+ * This program is free software; you can redistribute it and/or modify
33+ * it under the terms of the GNU General Public License as published by
34+ * the Free Software Foundation; either version 2 of the License, or
35+ * (at your option) any later version.
36+ *
37+ * This program is distributed in the hope that it will be useful,
38+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
39+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
40+ * GNU General Public License for more details.
41+ *
42+ * You should have received a copy of the GNU General Public License
43+ * along with this program; if not, write to the Free Software
44+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
45+ */
46+
47+/* We must include all headers memops.c includes to avoid trouble with
48+ * out namespace game below.
49+ */
50+#include <stdio.h>
51+#include <string.h>
52+#include <math.h>
53+#include <memory.h>
54+#include <stdlib.h>
55+#include <stdint.h>
56+#include <limits.h>
57+#ifdef __linux__
58+#include <endian.h>
59+#endif
60+#include "memops.h"
61+
62+#if defined (__SSE2__) && !defined (__sun__)
63+#include <emmintrin.h>
64+#ifdef __SSE4_1__
65+#include <smmintrin.h>
66+#endif
67+#endif
68+
69+#ifdef __ARM_NEON__
70+#include <arm_neon.h>
71+#endif
72+
73+// our additional headers
74+#include <time.h>
75+
76+/* Dirty: include mempos.c twice the second time with SIMD disabled
77+ * so we can compare aceelerated non accelerated
78+ */
79+namespace accelerated {
80+#include "../common/memops.c"
81+}
82+
83+namespace origerated {
84+#ifdef __SSE2__
85+#undef __SSE2__
86+#endif
87+
88+#ifdef __ARM_NEON__
89+#undef __ARM_NEON__
90+#endif
91+
92+#include "../common/memops.c"
93+}
94+
95+// define conversion function types
96+typedef void (*t_jack_to_integer)(
97+ char *dst,
98+ jack_default_audio_sample_t *src,
99+ unsigned long nsamples,
100+ unsigned long dst_skip,
101+ dither_state_t *state);
102+
103+typedef void (*t_integer_to_jack)(
104+ jack_default_audio_sample_t *dst,
105+ char *src,
106+ unsigned long nsamples,
107+ unsigned long src_skip);
108+
109+// define/setup test case data
110+typedef struct test_case_data {
111+ uint32_t frame_size;
112+ uint32_t sample_size;
113+ bool reverse;
114+ t_jack_to_integer jack_to_integer_accel;
115+ t_jack_to_integer jack_to_integer_orig;
116+ t_integer_to_jack integer_to_jack_accel;
117+ t_integer_to_jack integer_to_jack_orig;
118+ dither_state_t *ditherstate;
119+ const char *name;
120+} test_case_data_t;
121+
122+test_case_data_t test_cases[] = {
123+ {
124+ 4,
125+ 3,
126+ true,
127+ accelerated::sample_move_d32u24_sSs,
128+ origerated::sample_move_d32u24_sSs,
129+ accelerated::sample_move_dS_s32u24s,
130+ origerated::sample_move_dS_s32u24s,
131+ NULL,
132+ "32u24s" },
133+ {
134+ 4,
135+ 3,
136+ false,
137+ accelerated::sample_move_d32u24_sS,
138+ origerated::sample_move_d32u24_sS,
139+ accelerated::sample_move_dS_s32u24,
140+ origerated::sample_move_dS_s32u24,
141+ NULL,
142+ "32u24" },
143+ {
144+ 3,
145+ 3,
146+ true,
147+ accelerated::sample_move_d24_sSs,
148+ origerated::sample_move_d24_sSs,
149+ accelerated::sample_move_dS_s24s,
150+ origerated::sample_move_dS_s24s,
151+ NULL,
152+ "24s" },
153+ {
154+ 3,
155+ 3,
156+ false,
157+ accelerated::sample_move_d24_sS,
158+ origerated::sample_move_d24_sS,
159+ accelerated::sample_move_dS_s24,
160+ origerated::sample_move_dS_s24,
161+ NULL,
162+ "24" },
163+ {
164+ 2,
165+ 2,
166+ true,
167+ accelerated::sample_move_d16_sSs,
168+ origerated::sample_move_d16_sSs,
169+ accelerated::sample_move_dS_s16s,
170+ origerated::sample_move_dS_s16s,
171+ NULL,
172+ "16s" },
173+ {
174+ 2,
175+ 2,
176+ false,
177+ accelerated::sample_move_d16_sS,
178+ origerated::sample_move_d16_sS,
179+ accelerated::sample_move_dS_s16,
180+ origerated::sample_move_dS_s16,
181+ NULL,
182+ "16" },
183+};
184+
185+// we need to repeat for better accuracy at time measurement
186+const uint32_t retry_per_case = 1000;
187+
188+// setup test buffers
189+#define TESTBUFF_SIZE 1024
190+jack_default_audio_sample_t jackbuffer_source[TESTBUFF_SIZE];
191+// integer buffers: max 4 bytes per value / * 2 for stereo
192+char integerbuffer_accel[TESTBUFF_SIZE*4*2];
193+char integerbuffer_orig[TESTBUFF_SIZE*4*2];
194+// float buffers
195+jack_default_audio_sample_t jackfloatbuffer_accel[TESTBUFF_SIZE];
196+jack_default_audio_sample_t jackfloatbuffer_orig[TESTBUFF_SIZE];
197+
198+// comparing unsigned makes life easier
199+uint32_t extract_integer(
200+ char* buff,
201+ uint32_t offset,
202+ uint32_t frame_size,
203+ uint32_t sample_size,
204+ bool big_endian)
205+{
206+ uint32_t retval = 0;
207+ unsigned char* curr;
208+ uint32_t mult = 1;
209+ if(big_endian) {
210+ curr = (unsigned char*)buff + offset + sample_size-1;
211+ for(uint32_t i=0; i<sample_size; i++) {
212+ retval += *(curr--) * mult;
213+ mult*=256;
214+ }
215+ }
216+ else {
217+ curr = (unsigned char*)buff + offset + frame_size-sample_size;
218+ for(uint32_t i=0; i<sample_size; i++) {
219+ retval += *(curr++) * mult;
220+ mult*=256;
221+ }
222+ }
223+ return retval;
224+}
225+
226+int main(int argc, char *argv[])
227+{
228+// parse_arguments(argc, argv);
229+ uint32_t maxerr_displayed = 10;
230+
231+ // fill jackbuffer
232+ for(int i=0; i<TESTBUFF_SIZE; i++) {
233+ // ramp
234+ jack_default_audio_sample_t value =
235+ ((jack_default_audio_sample_t)((i % TESTBUFF_SIZE) - TESTBUFF_SIZE/2)) / (TESTBUFF_SIZE/2);
236+ // force clipping
237+ value *= 1.02;
238+ jackbuffer_source[i] = value;
239+ }
240+
241+ for(uint32_t testcase=0; testcase<sizeof(test_cases)/sizeof(test_case_data_t); testcase++) {
242+ // test mono/stereo
243+ for(uint32_t channels=1; channels<=2; channels++) {
244+ //////////////////////////////////////////////////////////////////////////////
245+ // jackfloat -> integer
246+
247+ // clean target buffers
248+ memset(integerbuffer_accel, 0, sizeof(integerbuffer_accel));
249+ memset(integerbuffer_orig, 0, sizeof(integerbuffer_orig));
250+ // accel
251+ clock_t time_to_integer_accel = clock();
252+ for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
253+ {
254+ test_cases[testcase].jack_to_integer_accel(
255+ integerbuffer_accel,
256+ jackbuffer_source,
257+ TESTBUFF_SIZE,
258+ test_cases[testcase].frame_size*channels,
259+ test_cases[testcase].ditherstate);
260+ }
261+ float timediff_to_integer_accel = ((float)(clock() - time_to_integer_accel)) / CLOCKS_PER_SEC;
262+ // orig
263+ clock_t time_to_integer_orig = clock();
264+ for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
265+ {
266+ test_cases[testcase].jack_to_integer_orig(
267+ integerbuffer_orig,
268+ jackbuffer_source,
269+ TESTBUFF_SIZE,
270+ test_cases[testcase].frame_size*channels,
271+ test_cases[testcase].ditherstate);
272+ }
273+ float timediff_to_integer_orig = ((float)(clock() - time_to_integer_orig)) / CLOCKS_PER_SEC;
274+ // output performance results
275+ printf(
276+ "JackFloat->Integer @%7.7s/%u: Orig %7.6f sec / Accel %7.6f sec -> Win: %5.2f %%\n",
277+ test_cases[testcase].name,
278+ channels,
279+ timediff_to_integer_orig,
280+ timediff_to_integer_accel,
281+ (timediff_to_integer_orig/timediff_to_integer_accel-1)*100.0);
282+ uint32_t int_deviation_max = 0;
283+ uint32_t int_error_count = 0;
284+ // output error (avoid spam -> limit error lines per test case)
285+ for(uint32_t sample=0; sample<TESTBUFF_SIZE; sample++) {
286+ uint32_t sample_offset = sample*test_cases[testcase].frame_size*channels;
287+ // compare both results
288+ uint32_t intval_accel=extract_integer(
289+ integerbuffer_accel,
290+ sample_offset,
291+ test_cases[testcase].frame_size,
292+ test_cases[testcase].sample_size,
293+#if __BYTE_ORDER == __BIG_ENDIAN
294+ !test_cases[testcase].reverse);
295+#else
296+ test_cases[testcase].reverse);
297+#endif
298+ uint32_t intval_orig=extract_integer(
299+ integerbuffer_orig,
300+ sample_offset,
301+ test_cases[testcase].frame_size,
302+ test_cases[testcase].sample_size,
303+#if __BYTE_ORDER == __BIG_ENDIAN
304+ !test_cases[testcase].reverse);
305+#else
306+ test_cases[testcase].reverse);
307+#endif
308+ if(intval_accel != intval_orig) {
309+ if(int_error_count<maxerr_displayed) {
310+ printf("Value error sample %u:", sample);
311+ printf(" Orig 0x");
312+ char formatstr[10];
313+ sprintf(formatstr, "%%0%uX", test_cases[testcase].sample_size*2);
314+ printf(formatstr, intval_orig);
315+ printf(" Accel 0x");
316+ printf(formatstr, intval_accel);
317+ printf("\n");
318+ }
319+ int_error_count++;
320+ uint32_t int_deviation;
321+ if(intval_accel > intval_orig)
322+ int_deviation = intval_accel-intval_orig;
323+ else
324+ int_deviation = intval_orig-intval_accel;
325+ if(int_deviation > int_deviation_max)
326+ int_deviation_max = int_deviation;
327+ }
328+ }
329+ printf(
330+ "JackFloat->Integer @%7.7s/%u: Errors: %u Max deviation %u\n",
331+ test_cases[testcase].name,
332+ channels,
333+ int_error_count,
334+ int_deviation_max);
335+
336+ //////////////////////////////////////////////////////////////////////////////
337+ // integer -> jackfloat
338+
339+ // clean target buffers
340+ memset(jackfloatbuffer_accel, 0, sizeof(jackfloatbuffer_accel));
341+ memset(jackfloatbuffer_orig, 0, sizeof(jackfloatbuffer_orig));
342+ // accel
343+ clock_t time_to_float_accel = clock();
344+ for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
345+ {
346+ test_cases[testcase].integer_to_jack_accel(
347+ jackfloatbuffer_accel,
348+ integerbuffer_orig,
349+ TESTBUFF_SIZE,
350+ test_cases[testcase].frame_size*channels);
351+ }
352+ float timediff_to_float_accel = ((float)(clock() - time_to_float_accel)) / CLOCKS_PER_SEC;
353+ // orig
354+ clock_t time_to_float_orig = clock();
355+ for(uint32_t repetition=0; repetition<retry_per_case; repetition++)
356+ {
357+ test_cases[testcase].integer_to_jack_orig(
358+ jackfloatbuffer_orig,
359+ integerbuffer_orig,
360+ TESTBUFF_SIZE,
361+ test_cases[testcase].frame_size*channels);
362+ }
363+ float timediff_to_float_orig = ((float)(clock() - time_to_float_orig)) / CLOCKS_PER_SEC;
364+ // output performance results
365+ printf(
366+ "Integer->JackFloat @%7.7s/%u: Orig %7.6f sec / Accel %7.6f sec -> Win: %5.2f %%\n",
367+ test_cases[testcase].name,
368+ channels,
369+ timediff_to_float_orig,
370+ timediff_to_float_accel,
371+ (timediff_to_float_orig/timediff_to_float_accel-1)*100.0);
372+ jack_default_audio_sample_t float_deviation_max = 0.0;
373+ uint32_t float_error_count = 0;
374+ // output error (avoid spam -> limit error lines per test case)
375+ for(uint32_t sample=0; sample<TESTBUFF_SIZE; sample++) {
376+ // For easier estimation/readabilty we scale floats back to integer
377+ jack_default_audio_sample_t sample_scaling;
378+ switch(test_cases[testcase].sample_size) {
379+ case 2:
380+ sample_scaling = SAMPLE_16BIT_SCALING;
381+ break;
382+ default:
383+ sample_scaling = SAMPLE_24BIT_SCALING;
384+ break;
385+ }
386+ jack_default_audio_sample_t floatval_accel = jackfloatbuffer_accel[sample] * sample_scaling;
387+ jack_default_audio_sample_t floatval_orig = jackfloatbuffer_orig[sample] * sample_scaling;
388+ // compare both results
389+ jack_default_audio_sample_t float_deviation;
390+ if(floatval_accel > floatval_orig)
391+ float_deviation = floatval_accel-floatval_orig;
392+ else
393+ float_deviation = floatval_orig-floatval_accel;
394+ if(float_deviation > float_deviation_max)
395+ float_deviation_max = float_deviation;
396+ // deviation > half bit => error
397+ if(float_deviation > 0.5) {
398+ if(float_error_count<maxerr_displayed) {
399+ printf("Value error sample %u:", sample);
400+ printf(" Orig %8.1f Accel %8.1f\n", floatval_orig, floatval_accel);
401+ }
402+ float_error_count++;
403+ }
404+ }
405+ printf(
406+ "Integer->JackFloat @%7.7s/%u: Errors: %u Max deviation %f\n",
407+ test_cases[testcase].name,
408+ channels,
409+ float_error_count,
410+ float_deviation_max);
411+
412+ printf("\n");
413+ }
414+ }
415+ return 0;
416+}
417diff --git a/example-clients/wscript b/example-clients/wscript
418index ba67614..1b2f674 100644
419--- a/example-clients/wscript
420+++ b/example-clients/wscript
421@@ -28,7 +28,8 @@ example_programs = {
422 'jack_net_master' : 'netmaster.c',
423 'jack_latent_client' : 'latent_client.c',
424 'jack_midi_dump' : 'midi_dump.c',
425- 'jack_midi_latency_test' : 'midi_latency_test.c'
426+ 'jack_midi_latency_test' : 'midi_latency_test.c',
427+ 'jack_simdtests' : 'simdtests.cpp'
428 }
429
430 example_libs = {
431--
4322.5.5
433
diff --git a/meta-oe/recipes-multimedia/jack/jack_git.bb b/meta-oe/recipes-multimedia/jack/jack_git.bb
index f0e91eba3..ba52691d3 100644
--- a/meta-oe/recipes-multimedia/jack/jack_git.bb
+++ b/meta-oe/recipes-multimedia/jack/jack_git.bb
@@ -14,12 +14,8 @@ LIC_FILES_CHKSUM = " \
14 14
15DEPENDS = "libsamplerate0 libsndfile1 readline" 15DEPENDS = "libsamplerate0 libsndfile1 readline"
16 16
17SRC_URI = " \ 17SRC_URI = "git://github.com/jackaudio/jack2.git"
18 git://github.com/jackaudio/jack2.git \ 18SRCREV = "2d1d323505585d406a7e64fb932953baefc5945e"
19 file://0001-Add-ARM-NEON-acceleration-for-all-non-dithering-samp.patch \
20 file://0002-jack_simdtests-add-application-checking-accurracy-an.patch \
21"
22SRCREV = "0279a2d65a36d1378f5bab56d95bf9e99cc8cefb"
23PV = "1.9.10+git${SRCPV}" 19PV = "1.9.10+git${SRCPV}"
24S = "${WORKDIR}/git" 20S = "${WORKDIR}/git"
25 21