Diffstat (limited to 'meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch')
-rw-r--r-- meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch | 769
1 file changed, 769 insertions(+), 0 deletions(-)
diff --git a/meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch b/meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch
new file mode 100644
index 0000000000..323ac7da83
--- /dev/null
+++ b/meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch
@@ -0,0 +1,769 @@
1From 0880595f9b08d15da0e72cefaf24841cbb930883 Mon Sep 17 00:00:00 2001
2From: Gyorgy Sarvari <skandigraun@gmail.com>
3Date: Sat, 7 Jun 2025 14:10:40 +0200
4Subject: [PATCH] add missing files
5
6Due to a release issue, two files were not added to the libtheora 1.2.0
7release tarball. These files are required to build the library for 32-bit
8ARM systems with assembly optimizations enabled.
9
10This patch adds these files.
11
12This is not a code issue per se, but rather a packaging one, as the files
13are present in the source code repository.
14
15Upstream-Status: Backport [https://gitlab.xiph.org/xiph/theora/-/issues/2338]
16
17Signed-off-by: Gyorgy Sarvari <skandigraun@gmail.com>
18---
19 lib/arm/armenc.c | 57 ++++
20 lib/arm/armloop.s | 676 ++++++++++++++++++++++++++++++++++++++++++++++
21 2 files changed, 733 insertions(+)
22 create mode 100644 lib/arm/armenc.c
23 create mode 100644 lib/arm/armloop.s
24
25diff --git a/lib/arm/armenc.c b/lib/arm/armenc.c
26new file mode 100644
27index 0000000..4cfb8a7
28--- /dev/null
29+++ b/lib/arm/armenc.c
30@@ -0,0 +1,57 @@
31+/********************************************************************
32+ * *
33+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
34+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
35+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
36+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
37+ * *
38+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
39+ * by the Xiph.Org Foundation and contributors *
40+ * https://www.xiph.org/ *
41+ * *
42+ ********************************************************************
43+
44+ function:
45+
46+ ********************************************************************/
47+#include "armenc.h"
48+
49+#if defined(OC_ARM_ASM)
50+
51+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
52+ ogg_uint32_t cpu_flags;
53+ cpu_flags=_enc->state.cpu_flags;
54+ oc_enc_accel_init_c(_enc);
55+# if defined(OC_ENC_USE_VTABLE)
56+ /*TODO: Add ARMv4 functions here.*/
57+# endif
58+# if defined(OC_ARM_ASM_EDSP)
59+ if(cpu_flags&OC_CPU_ARM_EDSP){
60+# if defined(OC_STATE_USE_VTABLE)
61+ /*TODO: Add EDSP functions here.*/
62+# endif
63+ }
64+# if defined(OC_ARM_ASM_MEDIA)
65+ if(cpu_flags&OC_CPU_ARM_MEDIA){
66+# if defined(OC_STATE_USE_VTABLE)
67+ /*TODO: Add Media functions here.*/
68+# endif
69+ }
70+# if defined(OC_ARM_ASM_NEON)
71+ if(cpu_flags&OC_CPU_ARM_NEON){
72+# if defined(OC_STATE_USE_VTABLE)
73+ _enc->opt_vtable.frag_satd=oc_enc_frag_satd_neon;
74+ _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_neon;
75+ _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_neon;
76+ _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
77+ _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
78+ _enc->opt_vtable.quantize=oc_enc_quantize_neon;
79+# endif
80+ _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
81+ _enc->opt_data.enquant_table_alignment=16;
82+ }
83+# endif
84+# endif
85+# endif
86+}
87+#endif
88diff --git a/lib/arm/armloop.s b/lib/arm/armloop.s
89new file mode 100644
90index 0000000..c35da0f
91--- /dev/null
92+++ b/lib/arm/armloop.s
93@@ -0,0 +1,676 @@
94+;********************************************************************
95+;* *
96+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
97+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
98+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
99+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
100+;* *
101+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
102+;* by the Xiph.Org Foundation and contributors *
103+;* https://www.xiph.org/ *
104+;* *
105+;********************************************************************
106+; Original implementation:
107+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
108+;********************************************************************
109+
110+ AREA |.text|, CODE, READONLY
111+
112+ GET armopts.s
113+
114+ EXPORT oc_loop_filter_frag_rows_arm
115+
116+; Which bit this is depends on the order of packing within a bitfield.
117+; Hopefully that doesn't change among any of the relevant compilers.
118+OC_FRAG_CODED_FLAG * 1
119+
120+ ; Vanilla ARM v4 version
121+loop_filter_h_arm PROC
122+ ; r0 = unsigned char *_pix
123+ ; r1 = int _ystride
124+ ; r2 = int *_bv
125+ ; preserves r0-r3
126+ STMFD r13!,{r3-r6,r14}
127+ MOV r14,#8
128+ MOV r6, #255
129+lfh_arm_lp
130+ LDRB r3, [r0, #-2] ; r3 = _pix[0]
131+ LDRB r12,[r0, #1] ; r12= _pix[3]
132+ LDRB r4, [r0, #-1] ; r4 = _pix[1]
133+ LDRB r5, [r0] ; r5 = _pix[2]
134+ SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
135+ ADD r3, r3, #4
136+ SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
137+ ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
138+ ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
139+ MOV r12,r12,ASR #3
140+ LDRSB r12,[r2, r12]
141+ ; Stall (2 on Xscale)
142+ ADDS r4, r4, r12
143+ CMPGT r6, r4
144+ EORLT r4, r6, r4, ASR #32
145+ SUBS r5, r5, r12
146+ CMPGT r6, r5
147+ EORLT r5, r6, r5, ASR #32
148+ STRB r4, [r0, #-1]
149+ STRB r5, [r0], r1
150+ SUBS r14,r14,#1
151+ BGT lfh_arm_lp
152+ SUB r0, r0, r1, LSL #3
153+ LDMFD r13!,{r3-r6,PC}
154+ ENDP
155+
156+loop_filter_v_arm PROC
157+ ; r0 = unsigned char *_pix
158+ ; r1 = int _ystride
159+ ; r2 = int *_bv
160+ ; preserves r0-r3
161+ STMFD r13!,{r3-r6,r14}
162+ MOV r14,#8
163+ MOV r6, #255
164+lfv_arm_lp
165+ LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0]
166+ LDRB r12,[r0, r1] ; r12= _pix[3]
167+ LDRB r4, [r0, -r1] ; r4 = _pix[1]
168+ LDRB r5, [r0] ; r5 = _pix[2]
169+ SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
170+ ADD r3, r3, #4
171+ SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
172+ ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
173+ ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
174+ MOV r12,r12,ASR #3
175+ LDRSB r12,[r2, r12]
176+ ; Stall (2 on Xscale)
177+ ADDS r4, r4, r12
178+ CMPGT r6, r4
179+ EORLT r4, r6, r4, ASR #32
180+ SUBS r5, r5, r12
181+ CMPGT r6, r5
182+ EORLT r5, r6, r5, ASR #32
183+ STRB r4, [r0, -r1]
184+ STRB r5, [r0], #1
185+ SUBS r14,r14,#1
186+ BGT lfv_arm_lp
187+ SUB r0, r0, #8
188+ LDMFD r13!,{r3-r6,PC}
189+ ENDP
190+
191+oc_loop_filter_frag_rows_arm PROC
192+ ; r0 = _ref_frame_data
193+ ; r1 = _ystride
194+ ; r2 = _bv
195+ ; r3 = _frags
196+ ; r4 = _fragi0
197+ ; r5 = _fragi0_end
198+ ; r6 = _fragi_top
199+ ; r7 = _fragi_bot
200+ ; r8 = _frag_buf_offs
201+ ; r9 = _nhfrags
202+ MOV r12,r13
203+ STMFD r13!,{r0,r4-r11,r14}
204+ LDMFD r12,{r4-r9}
205+ ADD r2, r2, #127 ; _bv += 127
206+ CMP r4, r5 ; if(_fragi0>=_fragi0_end)
207+ BGE oslffri_arm_end ; bail
208+ SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
209+ BLE oslffri_arm_end ; bail
210+ ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
211+ ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
212+ SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
213+oslffri_arm_lp1
214+ MOV r10,r4 ; r10= fragi = _fragi0
215+ ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
216+oslffri_arm_lp2
217+ LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
218+ LDR r0, [r13] ; r0 = _ref_frame_data
219+ LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
220+ TST r14,#OC_FRAG_CODED_FLAG
221+ BEQ oslffri_arm_uncoded
222+ CMP r10,r4 ; if (fragi>_fragi0)
223+ ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
224+ BLGT loop_filter_h_arm
225+ CMP r4, r6 ; if (_fragi0>_fragi_top)
226+ BLGT loop_filter_v_arm
227+ CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
228+ LDRLT r12,[r3] ; r12 = _frags[fragi+1]
229+ ADD r0, r0, #8
230+ ADD r10,r10,#1 ; r10 = fragi+1;
231+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
232+ CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
233+ BLLT loop_filter_h_arm
234+ CMP r10,r7 ; if (fragi<_fragi_bot)
235+ LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
236+ SUB r0, r0, #8
237+ ADD r0, r0, r1, LSL #3
238+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
239+ CMPLT r12,#OC_FRAG_CODED_FLAG
240+ BLLT loop_filter_v_arm
241+ CMP r10,r11 ; while(fragi<=fragi_end-1)
242+ BLE oslffri_arm_lp2
243+ MOV r4, r10 ; r4 = fragi0 += _nhfrags
244+ CMP r4, r5
245+ BLT oslffri_arm_lp1
246+oslffri_arm_end
247+ LDMFD r13!,{r0,r4-r11,PC}
248+oslffri_arm_uncoded
249+ ADD r10,r10,#1
250+ CMP r10,r11
251+ BLE oslffri_arm_lp2
252+ MOV r4, r10 ; r4 = _fragi0 += _nhfrags
253+ CMP r4, r5
254+ BLT oslffri_arm_lp1
255+ LDMFD r13!,{r0,r4-r11,PC}
256+ ENDP
257+
258+ [ OC_ARM_ASM_MEDIA
259+ EXPORT oc_loop_filter_init_v6
260+ EXPORT oc_loop_filter_frag_rows_v6
261+
262+oc_loop_filter_init_v6 PROC
263+ ; r0 = _bv
264+ ; r1 = _flimit (=L from the spec)
265+ MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L>
266+ AND r1, r1, #255 ; r1 = ll=r1&0xFF
267+ ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll>
268+ PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll>
269+ STR r1, [r0]
270+ MOV PC,r14
271+ ENDP
272+
273+; We could use the same strategy as the v filter below, but that would require
274+; 40 instructions to load the data and transpose it into columns and another
275+; 32 to write out the results at the end, plus the 52 instructions to do the
276+; filtering itself.
277+; This is slightly less, and less code, even assuming we could have shared the
278+; 52 instructions in the middle with the other function.
279+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
280+; proposed for FFmpeg, but not by much:
281+; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
282+; His is a lot less code, though, because it only does two rows at once instead
283+; of four.
284+loop_filter_h_v6 PROC
285+ ; r0 = unsigned char *_pix
286+ ; r1 = int _ystride
287+ ; r2 = int _ll
288+ ; preserves r0-r3
289+ STMFD r13!,{r4-r11,r14}
290+ LDR r12,=0x10003
291+ BL loop_filter_h_core_v6
292+ ADD r0, r0, r1, LSL #2
293+ BL loop_filter_h_core_v6
294+ SUB r0, r0, r1, LSL #2
295+ LDMFD r13!,{r4-r11,PC}
296+ ENDP
297+
298+loop_filter_h_core_v6 PROC
299+ ; r0 = unsigned char *_pix
300+ ; r1 = int _ystride
301+ ; r2 = int _ll
302+ ; r12= 0x10003
303+ ; Preserves r0-r3, r12; Clobbers r4-r11.
304+ LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0>
305+ ; Single issue
306+ LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0>
307+ UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2>
308+ UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1>
309+ UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2>
310+ UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1>
311+ PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1>
312+ PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2>
313+ SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2>
314+ SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3>
315+ SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2>
316+ SMLAD r7, r7, r12,r12 ; r7 = <????|(q0-q3)+3*(q2-q1)+4>
317+ LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0>
318+ MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
319+ LDR r5,[r0, r1]! ; r5 = <s3|s2|s1|s0>
320+ PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p>
321+ UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2>
322+ UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p>
323+ UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1>
324+ UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2>
325+ PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2>
326+ SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2>
327+ UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1>
328+ SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r2-r1)+3>
329+ SSUB16 r7, r5, r7 ; r7 = <r3-r0|r1-r2>
330+ SMLAD r7, r7, r12,r12 ; r7 = <????|(s0-s3)+3*(s2-s1)+4>
331+ ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2>
332+ MOV r6, r6, ASR #3 ; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
333+ PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1>
334+ PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r>
335+ ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1>
336+ UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r>
337+ MOV r10,#0
338+ ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p>
339+ ; Single issue
340+ ; There's no min, max or abs instruction.
341+ ; SSUB8 and SEL will work for abs, and we can do all the rest with
342+ ; unsigned saturated adds, which means the GE flags are still all
343+ ; set when we're done computing lflim(abs(R_i),L).
344+ ; This allows us to both add and subtract, and split the results by
345+ ; the original sign of R_i.
346+ SSUB8 r7, r10,r6
347+ ; Single issue
348+ SEL r7, r7, r6 ; r7 = abs(R_i)
349+ ; Single issue
350+ UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0)
351+ ; Single issue
352+ UQADD8 r7, r7, r4
353+ ; Single issue
354+ UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
355+ ; Single issue
356+ UQSUB8 r4, r8, r7
357+ UQADD8 r5, r9, r7
358+ UQADD8 r8, r8, r7
359+ UQSUB8 r9, r9, r7
360+ SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L)
361+ SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L)
362+ MOV r5, r9, LSR #24 ; r5 = s2
363+ STRB r5, [r0,#2]!
364+ MOV r4, r8, LSR #24 ; r4 = s1
365+ STRB r4, [r0,#-1]
366+ MOV r5, r9, LSR #8 ; r5 = r2
367+ STRB r5, [r0,-r1]!
368+ MOV r4, r8, LSR #8 ; r4 = r1
369+ STRB r4, [r0,#-1]
370+ MOV r5, r9, LSR #16 ; r5 = q2
371+ STRB r5, [r0,-r1]!
372+ MOV r4, r8, LSR #16 ; r4 = q1
373+ STRB r4, [r0,#-1]
374+ ; Single issue
375+ STRB r9, [r0,-r1]!
376+ ; Single issue
377+ STRB r8, [r0,#-1]
378+ MOV PC,r14
379+ ENDP
380+
381+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
382+; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
383+; This works just as well, with the following procedure for computing the
384+; filter value, f:
385+; u = ~UHADD8(p1,~p2);
386+; v = UHADD8(~p1,p2);
387+; m = v-u;
388+; a = m^UHADD8(m^p0,m^~p3);
389+; f = UHADD8(UHADD8(a,u1),v1);
390+; where f = 127+R, with R in [-127,128] defined as in the spec.
391+; This is exactly the same amount of arithmetic as the version that uses PAVGB
392+; as the basic operator.
393+; It executes about 2/3 the number of instructions of David Conrad's approach,
394+; but requires more code, because it does all eight columns at once, instead
395+; of four at a time.
396+loop_filter_v_v6 PROC
397+ ; r0 = unsigned char *_pix
398+ ; r1 = int _ystride
399+ ; r2 = int _ll
400+ ; preserves r0-r11
401+ STMFD r13!,{r4-r11,r14}
402+ LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1>
403+ LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0>
404+ LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2>
405+ MVN r14,r6 ; r14= ~p1
406+ LDRD r10,[r0, r1] ; r11,r10= <p7|p3>
407+ ; Filter the first four columns.
408+ MVN r12,r8 ; r12= ~p2
409+ UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1
410+ UHADD8 r12,r12,r6 ; r12= p1+~p2>>1
411+ MVN r10, r10 ; r10=~p3
412+ MVN r12,r12 ; r12= u1=~p1+p2+1>>1
413+ SSUB8 r14,r14,r12 ; r14= m1=v1-u1
414+ ; Single issue
415+ EOR r4, r4, r14 ; r4 = m1^p0
416+ EOR r10,r10,r14 ; r10= m1^~p3
417+ UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1
418+ ; Single issue
419+ EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
420+ SADD8 r14,r14,r12 ; r14= v1=m1+u1
421+ UHADD8 r4, r4, r12 ; r4 = a1+u1>>1
422+ MVN r12,r9 ; r12= ~p6
423+ UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1
424+ ; Filter the second four columns.
425+ MVN r14,r7 ; r14= ~p5
426+ UHADD8 r12,r12,r7 ; r12= p5+~p6>>1
427+ UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1
428+ MVN r12,r12 ; r12= u2=~p5+p6+1>>1
429+ MVN r11,r11 ; r11=~p7
430+ SSUB8 r10,r14,r12 ; r10= m2=v2-u2
431+ ; Single issue
432+ EOR r5, r5, r10 ; r5 = m2^p4
433+ EOR r11,r11,r10 ; r11= m2^~p7
434+ UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1
435+ ; Single issue
436+ EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
437+ ; Single issue
438+ UHADD8 r5, r5, r12 ; r5 = a2+u2>>1
439+ LDR r12,=0x7F7F7F7F ; r12 = {127}x4
440+ UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1
441+ ; Now split f[i] by sign.
442+ ; There's no min or max instruction.
443+ ; We could use SSUB8 and SEL, but this is just as many instructions and
444+ ; dual issues more (for v7 without NEON).
445+ UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0
446+ UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0
447+ UQADD8 r11,r10,r2 ; r11= 255-max(2*L-abs(R_i<0),0)
448+ UQADD8 r14,r4, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
449+ UQADD8 r10,r10,r11
450+ UQADD8 r4, r4, r14
451+ UQSUB8 r10,r10,r11 ; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
452+ UQSUB8 r4, r4, r14 ; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
453+ UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0
454+ UQADD8 r6, r6, r10
455+ UQSUB8 r8, r8, r10
456+ UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0
457+ UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L)
458+ UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L)
459+ UQADD8 r10,r11,r2 ; r10= 255-max(2*L-abs(R_i<0),0)
460+ UQADD8 r14,r5, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
461+ UQADD8 r11,r11,r10
462+ UQADD8 r5, r5, r14
463+ UQSUB8 r11,r11,r10 ; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
464+ UQSUB8 r5, r5, r14 ; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
465+ UQADD8 r7, r7, r11
466+ UQSUB8 r9, r9, r11
467+ UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L)
468+ STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6]
469+ UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L)
470+ STRD r8, [r0] ; [p6:p2] = [r9: r8]
471+ LDMFD r13!,{r4-r11,PC}
472+ ENDP
473+
474+oc_loop_filter_frag_rows_v6 PROC
475+ ; r0 = _ref_frame_data
476+ ; r1 = _ystride
477+ ; r2 = _bv
478+ ; r3 = _frags
479+ ; r4 = _fragi0
480+ ; r5 = _fragi0_end
481+ ; r6 = _fragi_top
482+ ; r7 = _fragi_bot
483+ ; r8 = _frag_buf_offs
484+ ; r9 = _nhfrags
485+ MOV r12,r13
486+ STMFD r13!,{r0,r4-r11,r14}
487+ LDMFD r12,{r4-r9}
488+ LDR r2, [r2] ; ll = *(int *)_bv
489+ CMP r4, r5 ; if(_fragi0>=_fragi0_end)
490+ BGE oslffri_v6_end ; bail
491+ SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
492+ BLE oslffri_v6_end ; bail
493+ ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
494+ ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
495+ SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
496+oslffri_v6_lp1
497+ MOV r10,r4 ; r10= fragi = _fragi0
498+ ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
499+oslffri_v6_lp2
500+ LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
501+ LDR r0, [r13] ; r0 = _ref_frame_data
502+ LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
503+ TST r14,#OC_FRAG_CODED_FLAG
504+ BEQ oslffri_v6_uncoded
505+ CMP r10,r4 ; if (fragi>_fragi0)
506+ ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
507+ BLGT loop_filter_h_v6
508+ CMP r4, r6 ; if (fragi0>_fragi_top)
509+ BLGT loop_filter_v_v6
510+ CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
511+ LDRLT r12,[r3] ; r12 = _frags[fragi+1]
512+ ADD r0, r0, #8
513+ ADD r10,r10,#1 ; r10 = fragi+1;
514+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
515+ CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
516+ BLLT loop_filter_h_v6
517+ CMP r10,r7 ; if (fragi<_fragi_bot)
518+ LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
519+ SUB r0, r0, #8
520+ ADD r0, r0, r1, LSL #3
521+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
522+ CMPLT r12,#OC_FRAG_CODED_FLAG
523+ BLLT loop_filter_v_v6
524+ CMP r10,r11 ; while(fragi<=fragi_end-1)
525+ BLE oslffri_v6_lp2
526+ MOV r4, r10 ; r4 = fragi0 += nhfrags
527+ CMP r4, r5
528+ BLT oslffri_v6_lp1
529+oslffri_v6_end
530+ LDMFD r13!,{r0,r4-r11,PC}
531+oslffri_v6_uncoded
532+ ADD r10,r10,#1
533+ CMP r10,r11
534+ BLE oslffri_v6_lp2
535+ MOV r4, r10 ; r4 = fragi0 += nhfrags
536+ CMP r4, r5
537+ BLT oslffri_v6_lp1
538+ LDMFD r13!,{r0,r4-r11,PC}
539+ ENDP
540+ ]
541+
542+ [ OC_ARM_ASM_NEON
543+ EXPORT oc_loop_filter_init_neon
544+ EXPORT oc_loop_filter_frag_rows_neon
545+
546+oc_loop_filter_init_neon PROC
547+ ; r0 = _bv
548+ ; r1 = _flimit (=L from the spec)
549+ MOV r1, r1, LSL #1 ; r1 = 2*L
550+ VDUP.S16 Q15, r1 ; Q15= 2L in U16s
551+ VST1.64 {D30,D31}, [r0@128]
552+ MOV PC,r14
553+ ENDP
554+
555+loop_filter_h_neon PROC
556+ ; r0 = unsigned char *_pix
557+ ; r1 = int _ystride
558+ ; r2 = int *_bv
559+ ; preserves r0-r3
560+ ; We assume Q15= 2*L in U16s
561+ ; My best guesses at cycle counts (and latency)--vvv
562+ SUB r12,r0, #2
563+ ; Doing a 2-element structure load saves doing two VTRN's below, at the
564+ ; cost of using two more slower single-lane loads vs. the faster
565+ ; all-lane loads.
566+ ; It's less code this way, though, and benches a hair faster, but it
567+ ; leaves D2 and D4 swapped.
568+ VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1
569+ ; D2 = ____________3322
570+ VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1
571+ ; D6 = ____________7766
572+ VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1
573+ ; D2 = ________BBAA3322
574+ VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1
575+ ; D6 = ________FFEE7766
576+ VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____GGHH99881100 3,1
577+ ; D2 = ____JJIIBBAA3322
578+ VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____KKLLDDCC5544 3,1
579+ ; D6 = ____NNMMFFEE7766
580+ VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOGGHH99881100 3,1
581+ ; D2 = RRQQJJIIBBAA3322
582+ VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSKKLLDDCC5544 3,1
583+ ; D6 = VVUUNNMMFFEE7766
584+ VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1
585+ VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
586+ VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
587+ VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3
588+ ADD r12,r0, #8
589+ VADD.S16 Q0, Q0, Q8 ; 1,3
590+ PLD [r12]
591+ VADD.S16 Q0, Q0, Q8 ; 1,3
592+ PLD [r12,r1]
593+ VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
594+ PLD [r12,r1, LSL #1]
595+ VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
596+ ADD r12,r12,r1, LSL #2
597+ ; We want to do
598+ ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
599+ ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
600+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
601+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
602+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
603+ ; So we've reduced the left and right hand terms to be the same, except
604+ ; for a negation.
605+ ; Stall x3
606+ VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
607+ PLD [r12,-r1]
608+ VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
609+ PLD [r12]
610+ VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
611+ PLD [r12,r1]
612+ VMOVL.U8 Q1, D2 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
613+ PLD [r12,r1,LSL #1]
614+ VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
615+ ADD r12,r12,r1, LSL #2
616+ ; Now we need to correct for the sign of f.
617+ ; For negative elements of Q0, we want to subtract the appropriate
618+ ; element of Q9. For positive elements we want to add them. No NEON
619+ ; instruction exists to do this, so we need to negate the negative
620+ ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
621+ VADD.S16 Q9, Q9, Q0 ; 1,3
622+ PLD [r12,-r1]
623+ VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
624+ ; Bah. No VRSBW.U8
625+ ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
626+ VADDW.U8 Q2, Q9, D4 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
627+ VSUB.S16 Q1, Q1, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
628+ VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1
629+ VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1
630+ SUB r12,r0, #1
631+ VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = MMLLEEDD6655 1,1
632+ VST1.16 {D4[0]}, [r12], r1
633+ VST1.16 {D2[0]}, [r12], r1
634+ VST1.16 {D4[1]}, [r12], r1
635+ VST1.16 {D2[1]}, [r12], r1
636+ VST1.16 {D4[2]}, [r12], r1
637+ VST1.16 {D2[2]}, [r12], r1
638+ VST1.16 {D4[3]}, [r12], r1
639+ VST1.16 {D2[3]}, [r12], r1
640+ MOV PC,r14
641+ ENDP
642+
643+loop_filter_v_neon PROC
644+ ; r0 = unsigned char *_pix
645+ ; r1 = int _ystride
646+ ; r2 = int *_bv
647+ ; preserves r0-r3
648+ ; We assume Q15= 2*L in U16s
649+ ; My best guesses at cycle counts (and latency)--vvv
650+ SUB r12,r0, r1, LSL #1
651+ VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1
652+ VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1
653+ VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1
654+ VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1
655+ VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3
656+ VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
657+ ADD r12, #8
658+ VADD.S16 Q0, Q0, Q8 ; 1,3
659+ PLD [r12]
660+ VADD.S16 Q0, Q0, Q8 ; 1,3
661+ PLD [r12,r1]
662+ VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
663+ SUB r12, r0, r1
664+ VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
665+ ; We want to do
666+ ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
667+ ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
668+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
669+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
670+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
671+ ; So we've reduced the left and right hand terms to be the same, except
672+ ; for a negation.
673+ ; Stall x3
674+ VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
675+ VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
676+ ; Stall x2
677+ VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
678+ VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
679+ ; Stall x2
680+ VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
681+ ; Now we need to correct for the sign of f.
682+ ; For negative elements of Q0, we want to subtract the appropriate
683+ ; element of Q9. For positive elements we want to add them. No NEON
684+ ; instruction exists to do this, so we need to negate the negative
685+ ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
686+ ; Stall x3
687+ VADD.S16 Q9, Q9, Q0 ; 1,3
688+ ; Stall x2
689+ VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
690+ ; Bah. No VRSBW.U8
691+ ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
692+ VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
693+ VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
694+ VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1
695+ VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1
696+ VST1.64 {D2}, [r12@64], r1
697+ VST1.64 {D4}, [r12@64], r1
698+ MOV PC,r14
699+ ENDP
700+
701+oc_loop_filter_frag_rows_neon PROC
702+ ; r0 = _ref_frame_data
703+ ; r1 = _ystride
704+ ; r2 = _bv
705+ ; r3 = _frags
706+ ; r4 = _fragi0
707+ ; r5 = _fragi0_end
708+ ; r6 = _fragi_top
709+ ; r7 = _fragi_bot
710+ ; r8 = _frag_buf_offs
711+ ; r9 = _nhfrags
712+ MOV r12,r13
713+ STMFD r13!,{r0,r4-r11,r14}
714+ LDMFD r12,{r4-r9}
715+ CMP r4, r5 ; if(_fragi0>=_fragi0_end)
716+ BGE oslffri_neon_end; bail
717+ SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
718+ BLE oslffri_neon_end ; bail
719+ VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s
720+ ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
721+ ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
722+ SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
723+oslffri_neon_lp1
724+ MOV r10,r4 ; r10= fragi = _fragi0
725+ ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
726+oslffri_neon_lp2
727+ LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
728+ LDR r0, [r13] ; r0 = _ref_frame_data
729+ LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
730+ TST r14,#OC_FRAG_CODED_FLAG
731+ BEQ oslffri_neon_uncoded
732+ CMP r10,r4 ; if (fragi>_fragi0)
733+ ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
734+ BLGT loop_filter_h_neon
735+ CMP r4, r6 ; if (_fragi0>_fragi_top)
736+ BLGT loop_filter_v_neon
737+ CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
738+ LDRLT r12,[r3] ; r12 = _frags[fragi+1]
739+ ADD r0, r0, #8
740+ ADD r10,r10,#1 ; r10 = fragi+1;
741+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
742+ CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
743+ BLLT loop_filter_h_neon
744+ CMP r10,r7 ; if (fragi<_fragi_bot)
745+ LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
746+ SUB r0, r0, #8
747+ ADD r0, r0, r1, LSL #3
748+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
749+ CMPLT r12,#OC_FRAG_CODED_FLAG
750+ BLLT loop_filter_v_neon
751+ CMP r10,r11 ; while(fragi<=fragi_end-1)
752+ BLE oslffri_neon_lp2
753+ MOV r4, r10 ; r4 = _fragi0 += _nhfrags
754+ CMP r4, r5
755+ BLT oslffri_neon_lp1
756+oslffri_neon_end
757+ LDMFD r13!,{r0,r4-r11,PC}
758+oslffri_neon_uncoded
759+ ADD r10,r10,#1
760+ CMP r10,r11
761+ BLE oslffri_neon_lp2
762+ MOV r4, r10 ; r4 = _fragi0 += _nhfrags
763+ CMP r4, r5
764+ BLT oslffri_neon_lp1
765+ LDMFD r13!,{r0,r4-r11,PC}
766+ ENDP
767+ ]
768+
769+ END
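
For reference, the arithmetic that the ARMv4, ARMv6 and NEON routines in this patch all implement is the one spelled out in their comments: a raw filter response R = (_pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4)>>3, limited by lflim(R,L) = sign(R)*min(|R|,max(2*L-|R|,0)) before being added to _pix[1] and subtracted from _pix[2]. The code below is only a minimal C sketch of that per-edge update for illustration; the function names are hypothetical, and the shipped routines instead use the precomputed table/constant passed in via _bv and filter several columns per call.

#include <stdlib.h>

/* Saturate to the unsigned 8-bit pixel range, as the UQADD8/UQSUB8 and
   CMPGT/EORLT sequences do in the assembly above. */
static unsigned char oc_clamp255(int _v){
  return (unsigned char)(_v<0?0:_v>255?255:_v);
}

/* Filter one line of four pixels straddling a block edge.
   _flimit is L from the specification; only _pix[1] and _pix[2] change. */
static void oc_filter_edge_c(unsigned char _pix[4],int _flimit){
  int r;
  int v;
  int lim;
  r=(_pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4)>>3;  /* raw response R */
  v=abs(r);
  lim=2*_flimit-v;
  if(lim<0)lim=0;                                /* max(2*L-|R|,0) */
  if(v>lim)v=lim;                                /* min(|R|,max(2*L-|R|,0)) */
  r=r<0?-v:v;                                    /* restore the sign of R */
  _pix[1]=oc_clamp255(_pix[1]+r);                /* p1 += lflim(R,L) */
  _pix[2]=oc_clamp255(_pix[2]-r);                /* p2 -= lflim(R,L) */
}

The assembly versions avoid these branches with saturating SIMD arithmetic (UQADD8/UQSUB8 in the ARMv6 path; VQSUB, VMIN and VQMOVUN in the NEON path) and process four or eight columns at a time.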