Diffstat (limited to 'meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch')
-rw-r--r-- | meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch | 769
1 file changed, 769 insertions, 0 deletions
diff --git a/meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch b/meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch
new file mode 100644
index 0000000000..323ac7da83
--- /dev/null
+++ b/meta/recipes-multimedia/libtheora/libtheora/0001-add-missing-files.patch
@@ -0,0 +1,769 @@
From 0880595f9b08d15da0e72cefaf24841cbb930883 Mon Sep 17 00:00:00 2001
From: Gyorgy Sarvari <skandigraun@gmail.com>
Date: Sat, 7 Jun 2025 14:10:40 +0200
Subject: [PATCH] add missing files

Due to a release issue, two files were missing from the libtheora 1.2.0
release tarball. These files are required to build the library for
32-bit ARM systems with the assembly optimizations enabled.

This patch adds the two missing files.

This is not a code issue per se, but rather a tarballing one: the files
are present in the upstream source repository.

Upstream-Status: Backport [https://gitlab.xiph.org/xiph/theora/-/issues/2338]

Signed-off-by: Gyorgy Sarvari <skandigraun@gmail.com>
---
lib/arm/armenc.c | 57 ++++
lib/arm/armloop.s | 676 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 733 insertions(+)
create mode 100644 lib/arm/armenc.c
create mode 100644 lib/arm/armloop.s

diff --git a/lib/arm/armenc.c b/lib/arm/armenc.c
new file mode 100644
index 0000000..4cfb8a7
--- /dev/null
+++ b/lib/arm/armenc.c
@@ -0,0 +1,57 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors *
+ * https://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+
+ ********************************************************************/
+#include "armenc.h"
+
+#if defined(OC_ARM_ASM)
+
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
+ ogg_uint32_t cpu_flags;
+ cpu_flags=_enc->state.cpu_flags;
+ oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+ /*TODO: Add ARMv4 functions here.*/
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+ if(cpu_flags&OC_CPU_ARM_EDSP){
+# if defined(OC_STATE_USE_VTABLE)
+ /*TODO: Add EDSP functions here.*/
+# endif
+ }
+# if defined(OC_ARM_ASM_MEDIA)
+ if(cpu_flags&OC_CPU_ARM_MEDIA){
+# if defined(OC_STATE_USE_VTABLE)
+ /*TODO: Add Media functions here.*/
+# endif
+ }
+# if defined(OC_ARM_ASM_NEON)
+ if(cpu_flags&OC_CPU_ARM_NEON){
+# if defined(OC_STATE_USE_VTABLE)
+ _enc->opt_vtable.frag_satd=oc_enc_frag_satd_neon;
+ _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_neon;
+ _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_neon;
+ _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
+ _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
+ _enc->opt_vtable.quantize=oc_enc_quantize_neon;
+# endif
+ _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+ _enc->opt_data.enquant_table_alignment=16;
+ }
+# endif
+# endif
+# endif
+}
+#endif
diff --git a/lib/arm/armloop.s b/lib/arm/armloop.s
new file mode 100644
index 0000000..c35da0f
--- /dev/null
+++ b/lib/arm/armloop.s
@@ -0,0 +1,676 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors *
+;* https://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ GET armopts.s
+
+ EXPORT oc_loop_filter_frag_rows_arm
+
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG * 1
+
+ ; Vanilla ARM v4 version
+loop_filter_h_arm PROC
+ ; r0 = unsigned char *_pix
+ ; r1 = int _ystride
+ ; r2 = int *_bv
+ ; preserves r0-r3
+ STMFD r13!,{r3-r6,r14}
+ MOV r14,#8
+ MOV r6, #255
+lfh_arm_lp
+ LDRB r3, [r0, #-2] ; r3 = _pix[0]
+ LDRB r12,[r0, #1] ; r12= _pix[3]
+ LDRB r4, [r0, #-1] ; r4 = _pix[1]
+ LDRB r5, [r0] ; r5 = _pix[2]
+ SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
+ ADD r3, r3, #4
+ SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
+ ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
+ ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+ MOV r12,r12,ASR #3
+ LDRSB r12,[r2, r12]
+ ; Stall (2 on Xscale)
+ ADDS r4, r4, r12
+ CMPGT r6, r4
+ EORLT r4, r6, r4, ASR #32
+ SUBS r5, r5, r12
+ CMPGT r6, r5
+ EORLT r5, r6, r5, ASR #32
+ STRB r4, [r0, #-1]
+ STRB r5, [r0], r1
+ SUBS r14,r14,#1
+ BGT lfh_arm_lp
+ SUB r0, r0, r1, LSL #3
+ LDMFD r13!,{r3-r6,PC}
+ ENDP
+
+loop_filter_v_arm PROC
+ ; r0 = unsigned char *_pix
+ ; r1 = int _ystride
+ ; r2 = int *_bv
+ ; preserves r0-r3
+ STMFD r13!,{r3-r6,r14}
+ MOV r14,#8
+ MOV r6, #255
+lfv_arm_lp
+ LDRB r3, [r0, -r1, LSL #1] ; r3 = _pix[0]
+ LDRB r12,[r0, r1] ; r12= _pix[3]
+ LDRB r4, [r0, -r1] ; r4 = _pix[1]
+ LDRB r5, [r0] ; r5 = _pix[2]
+ SUB r3, r3, r12 ; r3 = _pix[0]-_pix[3]+4
+ ADD r3, r3, #4
+ SUB r12,r5, r4 ; r12= _pix[2]-_pix[1]
+ ADD r12,r12,r12,LSL #1 ; r12= 3*(_pix[2]-_pix[1])
+ ADD r12,r12,r3 ; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+ MOV r12,r12,ASR #3
+ LDRSB r12,[r2, r12]
+ ; Stall (2 on Xscale)
+ ADDS r4, r4, r12
+ CMPGT r6, r4
+ EORLT r4, r6, r4, ASR #32
+ SUBS r5, r5, r12
+ CMPGT r6, r5
+ EORLT r5, r6, r5, ASR #32
+ STRB r4, [r0, -r1]
+ STRB r5, [r0], #1
+ SUBS r14,r14,#1
+ BGT lfv_arm_lp
+ SUB r0, r0, #8
+ LDMFD r13!,{r3-r6,PC}
+ ENDP
+
+oc_loop_filter_frag_rows_arm PROC
+ ; r0 = _ref_frame_data
+ ; r1 = _ystride
+ ; r2 = _bv
+ ; r3 = _frags
+ ; r4 = _fragi0
+ ; r5 = _fragi0_end
+ ; r6 = _fragi_top
+ ; r7 = _fragi_bot
+ ; r8 = _frag_buf_offs
+ ; r9 = _nhfrags
+ MOV r12,r13
+ STMFD r13!,{r0,r4-r11,r14}
+ LDMFD r12,{r4-r9}
+ ADD r2, r2, #127 ; _bv += 127
+ CMP r4, r5 ; if(_fragi0>=_fragi0_end)
+ BGE oslffri_arm_end ; bail
+ SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
+ BLE oslffri_arm_end ; bail
+ ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
+ ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
+ SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
+oslffri_arm_lp1
+ MOV r10,r4 ; r10= fragi = _fragi0
+ ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+ LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
+ LDR r0, [r13] ; r0 = _ref_frame_data
+ LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
+ TST r14,#OC_FRAG_CODED_FLAG
+ BEQ oslffri_arm_uncoded
+ CMP r10,r4 ; if (fragi>_fragi0)
+ ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+ BLGT loop_filter_h_arm
+ CMP r4, r6 ; if (_fragi0>_fragi_top)
+ BLGT loop_filter_v_arm
+ CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+ LDRLT r12,[r3] ; r12 = _frags[fragi+1]
+ ADD r0, r0, #8
+ ADD r10,r10,#1 ; r10 = fragi+1;
+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+ CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
+ BLLT loop_filter_h_arm
+ CMP r10,r7 ; if (fragi<_fragi_bot)
+ LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
+ SUB r0, r0, #8
+ ADD r0, r0, r1, LSL #3
+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+ CMPLT r12,#OC_FRAG_CODED_FLAG
+ BLLT loop_filter_v_arm
+ CMP r10,r11 ; while(fragi<=fragi_end-1)
+ BLE oslffri_arm_lp2
+ MOV r4, r10 ; r4 = fragi0 += _nhfrags
+ CMP r4, r5
+ BLT oslffri_arm_lp1
+oslffri_arm_end
+ LDMFD r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+ ADD r10,r10,#1
+ CMP r10,r11
+ BLE oslffri_arm_lp2
+ MOV r4, r10 ; r4 = _fragi0 += _nhfrags
+ CMP r4, r5
+ BLT oslffri_arm_lp1
+ LDMFD r13!,{r0,r4-r11,PC}
+ ENDP
+
+ [ OC_ARM_ASM_MEDIA
+ EXPORT oc_loop_filter_init_v6
+ EXPORT oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6 PROC
+ ; r0 = _bv
+ ; r1 = _flimit (=L from the spec)
+ MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L>
+ AND r1, r1, #255 ; r1 = ll=r1&0xFF
+ ORR r1, r1, r1, LSL #8 ; r1 = <ll|ll>
+ PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll>
+ STR r1, [r0]
+ MOV PC,r14
+ ENDP
+
+; We could use the same strategy as the v filter below, but that would require
+; 40 instructions to load the data and transpose it into columns and another
+; 32 to write out the results at the end, plus the 52 instructions to do the
+; filtering itself.
+; This is slightly less, and less code, even assuming we could have shared the
+; 52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+; proposed for FFmpeg, but not by much:
+; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+; of four.
+loop_filter_h_v6 PROC
+ ; r0 = unsigned char *_pix
+ ; r1 = int _ystride
+ ; r2 = int _ll
+ ; preserves r0-r3
+ STMFD r13!,{r4-r11,r14}
+ LDR r12,=0x10003
+ BL loop_filter_h_core_v6
+ ADD r0, r0, r1, LSL #2
+ BL loop_filter_h_core_v6
+ SUB r0, r0, r1, LSL #2
+ LDMFD r13!,{r4-r11,PC}
+ ENDP
+
+loop_filter_h_core_v6 PROC
+ ; r0 = unsigned char *_pix
+ ; r1 = int _ystride
+ ; r2 = int _ll
+ ; r12= 0x10003
+ ; Preserves r0-r3, r12; Clobbers r4-r11.
+ LDR r4,[r0, #-2]! ; r4 = <p3|p2|p1|p0>
+ ; Single issue
+ LDR r5,[r0, r1]! ; r5 = <q3|q2|q1|q0>
+ UXTB16 r6, r4, ROR #16 ; r6 = <p0|p2>
+ UXTB16 r4, r4, ROR #8 ; r4 = <p3|p1>
+ UXTB16 r7, r5, ROR #16 ; r7 = <q0|q2>
+ UXTB16 r5, r5, ROR #8 ; r5 = <q3|q1>
+ PKHBT r8, r4, r5, LSL #16 ; r8 = <__|q1|__|p1>
+ PKHBT r9, r6, r7, LSL #16 ; r9 = <__|q2|__|p2>
+ SSUB16 r6, r4, r6 ; r6 = <p3-p0|p1-p2>
+ SMLAD r6, r6, r12,r12 ; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+ SSUB16 r7, r5, r7 ; r7 = <q3-q0|q1-q2>
+ SMLAD r7, r7, r12,r12 ; r7 = <????|(q0-q3)+3*(q2-q1)+4>
+ LDR r4,[r0, r1]! ; r4 = <r3|r2|r1|r0>
+ MOV r6, r6, ASR #3 ; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+ LDR r5,[r0, r1]! ; r5 = <s3|s2|s1|s0>
+ PKHBT r11,r6, r7, LSL #13 ; r11= <??|-R_q|??|-R_p>
+ UXTB16 r6, r4, ROR #16 ; r6 = <r0|r2>
+ UXTB16 r11,r11 ; r11= <__|-R_q|__|-R_p>
+ UXTB16 r4, r4, ROR #8 ; r4 = <r3|r1>
+ UXTB16 r7, r5, ROR #16 ; r7 = <s0|s2>
+ PKHBT r10,r6, r7, LSL #16 ; r10= <__|s2|__|r2>
+ SSUB16 r6, r4, r6 ; r6 = <r3-r0|r1-r2>
+ UXTB16 r5, r5, ROR #8 ; r5 = <s3|s1>
+ SMLAD r6, r6, r12,r12 ; r6 = <????|(r3-r0)+3*(r2-r1)+3>
+ SSUB16 r7, r5, r7 ; r7 = <r3-r0|r1-r2>
+ SMLAD r7, r7, r12,r12 ; r7 = <????|(s0-s3)+3*(s2-s1)+4>
+ ORR r9, r9, r10, LSL #8 ; r9 = <s2|q2|r2|p2>
+ MOV r6, r6, ASR #3 ; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+ PKHBT r10,r4, r5, LSL #16 ; r10= <__|s1|__|r1>
+ PKHBT r6, r6, r7, LSL #13 ; r6 = <??|-R_s|??|-R_r>
+ ORR r8, r8, r10, LSL #8 ; r8 = <s1|q1|r1|p1>
+ UXTB16 r6, r6 ; r6 = <__|-R_s|__|-R_r>
+ MOV r10,#0
+ ORR r6, r11,r6, LSL #8 ; r6 = <-R_s|-R_q|-R_r|-R_p>
+ ; Single issue
+ ; There's no min, max or abs instruction.
+ ; SSUB8 and SEL will work for abs, and we can do all the rest with
+ ; unsigned saturated adds, which means the GE flags are still all
+ ; set when we're done computing lflim(abs(R_i),L).
+ ; This allows us to both add and subtract, and split the results by
+ ; the original sign of R_i.
+ SSUB8 r7, r10,r6
+ ; Single issue
+ SEL r7, r7, r6 ; r7 = abs(R_i)
+ ; Single issue
+ UQADD8 r4, r7, r2 ; r4 = 255-max(2*L-abs(R_i),0)
+ ; Single issue
+ UQADD8 r7, r7, r4
+ ; Single issue
+ UQSUB8 r7, r7, r4 ; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+ ; Single issue
+ UQSUB8 r4, r8, r7
+ UQADD8 r5, r9, r7
+ UQADD8 r8, r8, r7
+ UQSUB8 r9, r9, r7
+ SEL r8, r8, r4 ; r8 = p1+lflim(R_i,L)
+ SEL r9, r9, r5 ; r9 = p2-lflim(R_i,L)
+ MOV r5, r9, LSR #24 ; r5 = s2
+ STRB r5, [r0,#2]!
+ MOV r4, r8, LSR #24 ; r4 = s1
+ STRB r4, [r0,#-1]
+ MOV r5, r9, LSR #8 ; r5 = r2
+ STRB r5, [r0,-r1]!
+ MOV r4, r8, LSR #8 ; r4 = r1
+ STRB r4, [r0,#-1]
+ MOV r5, r9, LSR #16 ; r5 = q2
+ STRB r5, [r0,-r1]!
+ MOV r4, r8, LSR #16 ; r4 = q1
+ STRB r4, [r0,#-1]
+ ; Single issue
+ STRB r9, [r0,-r1]!
+ ; Single issue
+ STRB r8, [r0,#-1]
+ MOV PC,r14
+ ENDP
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+; filter value, f:
+; u = ~UHADD8(p1,~p2);
+; v = UHADD8(~p1,p2);
+; m = v-u;
+; a = m^UHADD8(m^p0,m^~p3);
+; f = UHADD8(UHADD8(a,u1),v1);
+; where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+; as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+; but requires more code, because it does all eight columns at once, instead
+; of four at a time.
+loop_filter_v_v6 PROC
+ ; r0 = unsigned char *_pix
+ ; r1 = int _ystride
+ ; r2 = int _ll
+ ; preserves r0-r11
+ STMFD r13!,{r4-r11,r14}
+ LDRD r6, [r0, -r1]! ; r7, r6 = <p5|p1>
+ LDRD r4, [r0, -r1] ; r5, r4 = <p4|p0>
+ LDRD r8, [r0, r1]! ; r9, r8 = <p6|p2>
+ MVN r14,r6 ; r14= ~p1
+ LDRD r10,[r0, r1] ; r11,r10= <p7|p3>
+ ; Filter the first four columns.
+ MVN r12,r8 ; r12= ~p2
+ UHADD8 r14,r14,r8 ; r14= v1=~p1+p2>>1
+ UHADD8 r12,r12,r6 ; r12= p1+~p2>>1
+ MVN r10, r10 ; r10=~p3
+ MVN r12,r12 ; r12= u1=~p1+p2+1>>1
+ SSUB8 r14,r14,r12 ; r14= m1=v1-u1
+ ; Single issue
+ EOR r4, r4, r14 ; r4 = m1^p0
+ EOR r10,r10,r14 ; r10= m1^~p3
+ UHADD8 r4, r4, r10 ; r4 = (m1^p0)+(m1^~p3)>>1
+ ; Single issue
+ EOR r4, r4, r14 ; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+ SADD8 r14,r14,r12 ; r14= v1=m1+u1
+ UHADD8 r4, r4, r12 ; r4 = a1+u1>>1
+ MVN r12,r9 ; r12= ~p6
+ UHADD8 r4, r4, r14 ; r4 = f1=(a1+u1>>1)+v1>>1
+ ; Filter the second four columns.
+ MVN r14,r7 ; r14= ~p5
+ UHADD8 r12,r12,r7 ; r12= p5+~p6>>1
+ UHADD8 r14,r14,r9 ; r14= v2=~p5+p6>>1
+ MVN r12,r12 ; r12= u2=~p5+p6+1>>1
+ MVN r11,r11 ; r11=~p7
+ SSUB8 r10,r14,r12 ; r10= m2=v2-u2
+ ; Single issue
+ EOR r5, r5, r10 ; r5 = m2^p4
+ EOR r11,r11,r10 ; r11= m2^~p7
+ UHADD8 r5, r5, r11 ; r5 = (m2^p4)+(m2^~p7)>>1
+ ; Single issue
+ EOR r5, r5, r10 ; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+ ; Single issue
+ UHADD8 r5, r5, r12 ; r5 = a2+u2>>1
+ LDR r12,=0x7F7F7F7F ; r12 = {127}x4
+ UHADD8 r5, r5, r14 ; r5 = f2=(a2+u2>>1)+v2>>1
+ ; Now split f[i] by sign.
+ ; There's no min or max instruction.
+ ; We could use SSUB8 and SEL, but this is just as many instructions and
+ ; dual issues more (for v7 without NEON).
+ UQSUB8 r10,r4, r12 ; r10= R_i>0?R_i:0
+ UQSUB8 r4, r12,r4 ; r4 = R_i<0?-R_i:0
+ UQADD8 r11,r10,r2 ; r11= 255-max(2*L-abs(R_i<0),0)
+ UQADD8 r14,r4, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
+ UQADD8 r10,r10,r11
+ UQADD8 r4, r4, r14
+ UQSUB8 r10,r10,r11 ; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+ UQSUB8 r4, r4, r14 ; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+ UQSUB8 r11,r5, r12 ; r11= R_i>0?R_i:0
+ UQADD8 r6, r6, r10
+ UQSUB8 r8, r8, r10
+ UQSUB8 r5, r12,r5 ; r5 = R_i<0?-R_i:0
+ UQSUB8 r6, r6, r4 ; r6 = p1+lflim(R_i,L)
+ UQADD8 r8, r8, r4 ; r8 = p2-lflim(R_i,L)
+ UQADD8 r10,r11,r2 ; r10= 255-max(2*L-abs(R_i<0),0)
+ UQADD8 r14,r5, r2 ; r14= 255-max(2*L-abs(R_i>0),0)
+ UQADD8 r11,r11,r10
+ UQADD8 r5, r5, r14
+ UQSUB8 r11,r11,r10 ; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+ UQSUB8 r5, r5, r14 ; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+ UQADD8 r7, r7, r11
+ UQSUB8 r9, r9, r11
+ UQSUB8 r7, r7, r5 ; r7 = p5+lflim(R_i,L)
+ STRD r6, [r0, -r1] ; [p5:p1] = [r7: r6]
+ UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L)
+ STRD r8, [r0] ; [p6:p2] = [r9: r8]
+ LDMFD r13!,{r4-r11,PC}
+ ENDP
+
+oc_loop_filter_frag_rows_v6 PROC
+ ; r0 = _ref_frame_data
+ ; r1 = _ystride
+ ; r2 = _bv
+ ; r3 = _frags
+ ; r4 = _fragi0
+ ; r5 = _fragi0_end
+ ; r6 = _fragi_top
+ ; r7 = _fragi_bot
+ ; r8 = _frag_buf_offs
+ ; r9 = _nhfrags
+ MOV r12,r13
+ STMFD r13!,{r0,r4-r11,r14}
+ LDMFD r12,{r4-r9}
+ LDR r2, [r2] ; ll = *(int *)_bv
+ CMP r4, r5 ; if(_fragi0>=_fragi0_end)
+ BGE oslffri_v6_end ; bail
+ SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
+ BLE oslffri_v6_end ; bail
+ ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
+ ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
+ SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
+oslffri_v6_lp1
+ MOV r10,r4 ; r10= fragi = _fragi0
+ ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+ LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
+ LDR r0, [r13] ; r0 = _ref_frame_data
+ LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
+ TST r14,#OC_FRAG_CODED_FLAG
+ BEQ oslffri_v6_uncoded
+ CMP r10,r4 ; if (fragi>_fragi0)
+ ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+ BLGT loop_filter_h_v6
+ CMP r4, r6 ; if (fragi0>_fragi_top)
+ BLGT loop_filter_v_v6
+ CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+ LDRLT r12,[r3] ; r12 = _frags[fragi+1]
+ ADD r0, r0, #8
+ ADD r10,r10,#1 ; r10 = fragi+1;
+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+ CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
+ BLLT loop_filter_h_v6
+ CMP r10,r7 ; if (fragi<_fragi_bot)
+ LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
+ SUB r0, r0, #8
+ ADD r0, r0, r1, LSL #3
+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+ CMPLT r12,#OC_FRAG_CODED_FLAG
+ BLLT loop_filter_v_v6
+ CMP r10,r11 ; while(fragi<=fragi_end-1)
+ BLE oslffri_v6_lp2
+ MOV r4, r10 ; r4 = fragi0 += nhfrags
+ CMP r4, r5
+ BLT oslffri_v6_lp1
+oslffri_v6_end
+ LDMFD r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+ ADD r10,r10,#1
+ CMP r10,r11
+ BLE oslffri_v6_lp2
+ MOV r4, r10 ; r4 = fragi0 += nhfrags
+ CMP r4, r5
+ BLT oslffri_v6_lp1
+ LDMFD r13!,{r0,r4-r11,PC}
+ ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+ EXPORT oc_loop_filter_init_neon
+ EXPORT oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon PROC
+ ; r0 = _bv
+ ; r1 = _flimit (=L from the spec)
+ MOV r1, r1, LSL #1 ; r1 = 2*L
+ VDUP.S16 Q15, r1 ; Q15= 2L in U16s
+ VST1.64 {D30,D31}, [r0@128]
+ MOV PC,r14
+ ENDP
+
+loop_filter_h_neon PROC
+ ; r0 = unsigned char *_pix
+ ; r1 = int _ystride
+ ; r2 = int *_bv
+ ; preserves r0-r3
+ ; We assume Q15= 2*L in U16s
+ ; My best guesses at cycle counts (and latency)--vvv
+ SUB r12,r0, #2
+ ; Doing a 2-element structure load saves doing two VTRN's below, at the
+ ; cost of using two more slower single-lane loads vs. the faster
+ ; all-lane loads.
+ ; It's less code this way, though, and benches a hair faster, but it
+ ; leaves D2 and D4 swapped.
+ VLD2.16 {D0[],D2[]}, [r12], r1 ; D0 = ____________1100 2,1
+ ; D2 = ____________3322
+ VLD2.16 {D4[],D6[]}, [r12], r1 ; D4 = ____________5544 2,1
+ ; D6 = ____________7766
+ VLD2.16 {D0[1],D2[1]},[r12], r1 ; D0 = ________99881100 3,1
+ ; D2 = ________BBAA3322
+ VLD2.16 {D4[1],D6[1]},[r12], r1 ; D4 = ________DDCC5544 3,1
+ ; D6 = ________FFEE7766
+ VLD2.16 {D0[2],D2[2]},[r12], r1 ; D0 = ____GGHH99881100 3,1
+ ; D2 = ____JJIIBBAA3322
+ VLD2.16 {D4[2],D6[2]},[r12], r1 ; D4 = ____KKLLDDCC5544 3,1
+ ; D6 = ____NNMMFFEE7766
+ VLD2.16 {D0[3],D2[3]},[r12], r1 ; D0 = PPOOGGHH99881100 3,1
+ ; D2 = RRQQJJIIBBAA3322
+ VLD2.16 {D4[3],D6[3]},[r12], r1 ; D4 = TTSSKKLLDDCC5544 3,1
+ ; D6 = VVUUNNMMFFEE7766
+ VTRN.8 D0, D4 ; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1
+ VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
+ VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
+ VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3
+ ADD r12,r0, #8
+ VADD.S16 Q0, Q0, Q8 ; 1,3
+ PLD [r12]
+ VADD.S16 Q0, Q0, Q8 ; 1,3
+ PLD [r12,r1]
+ VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
+ PLD [r12,r1, LSL #1]
+ VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
+ ADD r12,r12,r1, LSL #2
+ ; We want to do
+ ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+ ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+ ; So we've reduced the left and right hand terms to be the same, except
+ ; for a negation.
+ ; Stall x3
+ VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
+ PLD [r12,-r1]
+ VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
+ PLD [r12]
+ VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
+ PLD [r12,r1]
+ VMOVL.U8 Q1, D2 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
+ PLD [r12,r1,LSL #1]
+ VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
+ ADD r12,r12,r1, LSL #2
+ ; Now we need to correct for the sign of f.
+ ; For negative elements of Q0, we want to subtract the appropriate
+ ; element of Q9. For positive elements we want to add them. No NEON
+ ; instruction exists to do this, so we need to negate the negative
+ ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+ VADD.S16 Q9, Q9, Q0 ; 1,3
+ PLD [r12,-r1]
+ VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
+ ; Bah. No VRSBW.U8
+ ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+ VADDW.U8 Q2, Q9, D4 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
+ VSUB.S16 Q1, Q1, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
+ VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1
+ VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1
+ SUB r12,r0, #1
+ VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = MMLLEEDD6655 1,1
+ VST1.16 {D4[0]}, [r12], r1
+ VST1.16 {D2[0]}, [r12], r1
+ VST1.16 {D4[1]}, [r12], r1
+ VST1.16 {D2[1]}, [r12], r1
+ VST1.16 {D4[2]}, [r12], r1
+ VST1.16 {D2[2]}, [r12], r1
+ VST1.16 {D4[3]}, [r12], r1
+ VST1.16 {D2[3]}, [r12], r1
+ MOV PC,r14
+ ENDP
+
+loop_filter_v_neon PROC
+ ; r0 = unsigned char *_pix
+ ; r1 = int _ystride
+ ; r2 = int *_bv
+ ; preserves r0-r3
+ ; We assume Q15= 2*L in U16s
+ ; My best guesses at cycle counts (and latency)--vvv
+ SUB r12,r0, r1, LSL #1
+ VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1
+ VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1
+ VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1
+ VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1
+ VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3
+ VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
+ ADD r12, #8
+ VADD.S16 Q0, Q0, Q8 ; 1,3
+ PLD [r12]
+ VADD.S16 Q0, Q0, Q8 ; 1,3
+ PLD [r12,r1]
+ VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
+ SUB r12, r0, r1
+ VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
+ ; We want to do
+ ; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+ ; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+ ; = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+ ; So we've reduced the left and right hand terms to be the same, except
+ ; for a negation.
+ ; Stall x3
+ VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
+ VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
+ ; Stall x2
+ VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
+ VMOVL.U8 Q2, D4 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
+ ; Stall x2
+ VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
+ ; Now we need to correct for the sign of f.
+ ; For negative elements of Q0, we want to subtract the appropriate
+ ; element of Q9. For positive elements we want to add them. No NEON
+ ; instruction exists to do this, so we need to negate the negative
+ ; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+ ; Stall x3
+ VADD.S16 Q9, Q9, Q0 ; 1,3
+ ; Stall x2
+ VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
+ ; Bah. No VRSBW.U8
+ ; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+ VADDW.U8 Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
+ VSUB.S16 Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
+ VQMOVUN.S16 D2, Q1 ; D2 = TTPPLLHHDD995511 1,1
+ VQMOVUN.S16 D4, Q2 ; D4 = UUQQMMIIEEAA6622 1,1
+ VST1.64 {D2}, [r12@64], r1
+ VST1.64 {D4}, [r12@64], r1
+ MOV PC,r14
+ ENDP
+
+oc_loop_filter_frag_rows_neon PROC
+ ; r0 = _ref_frame_data
+ ; r1 = _ystride
+ ; r2 = _bv
+ ; r3 = _frags
+ ; r4 = _fragi0
+ ; r5 = _fragi0_end
+ ; r6 = _fragi_top
+ ; r7 = _fragi_bot
+ ; r8 = _frag_buf_offs
+ ; r9 = _nhfrags
+ MOV r12,r13
+ STMFD r13!,{r0,r4-r11,r14}
+ LDMFD r12,{r4-r9}
+ CMP r4, r5 ; if(_fragi0>=_fragi0_end)
+ BGE oslffri_neon_end; bail
+ SUBS r9, r9, #1 ; r9 = _nhfrags-1 if (r9<=0)
+ BLE oslffri_neon_end ; bail
+ VLD1.64 {D30,D31}, [r2@128] ; Q15= 2L in U16s
+ ADD r3, r3, r4, LSL #2 ; r3 = &_frags[fragi]
+ ADD r8, r8, r4, LSL #2 ; r8 = &_frag_buf_offs[fragi]
+ SUB r7, r7, r9 ; _fragi_bot -= _nhfrags;
+oslffri_neon_lp1
+ MOV r10,r4 ; r10= fragi = _fragi0
+ ADD r11,r4, r9 ; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+ LDR r14,[r3], #4 ; r14= _frags[fragi] _frags++
+ LDR r0, [r13] ; r0 = _ref_frame_data
+ LDR r12,[r8], #4 ; r12= _frag_buf_offs[fragi] _frag_buf_offs++
+ TST r14,#OC_FRAG_CODED_FLAG
+ BEQ oslffri_neon_uncoded
+ CMP r10,r4 ; if (fragi>_fragi0)
+ ADD r0, r0, r12 ; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+ BLGT loop_filter_h_neon
+ CMP r4, r6 ; if (_fragi0>_fragi_top)
+ BLGT loop_filter_v_neon
+ CMP r10,r11 ; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+ LDRLT r12,[r3] ; r12 = _frags[fragi+1]
+ ADD r0, r0, #8
+ ADD r10,r10,#1 ; r10 = fragi+1;
+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+ CMPLT r12,#OC_FRAG_CODED_FLAG ; && _frags[fragi+1].coded==0
+ BLLT loop_filter_h_neon
+ CMP r10,r7 ; if (fragi<_fragi_bot)
+ LDRLT r12,[r3, r9, LSL #2] ; r12 = _frags[fragi+1+_nhfrags-1]
+ SUB r0, r0, #8
+ ADD r0, r0, r1, LSL #3
+ ANDLT r12,r12,#OC_FRAG_CODED_FLAG
+ CMPLT r12,#OC_FRAG_CODED_FLAG
+ BLLT loop_filter_v_neon
+ CMP r10,r11 ; while(fragi<=fragi_end-1)
+ BLE oslffri_neon_lp2
+ MOV r4, r10 ; r4 = _fragi0 += _nhfrags
+ CMP r4, r5
+ BLT oslffri_neon_lp1
+oslffri_neon_end
+ LDMFD r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+ ADD r10,r10,#1
+ CMP r10,r11
+ BLE oslffri_neon_lp2
+ MOV r4, r10 ; r4 = _fragi0 += _nhfrags
+ CMP r4, r5
+ BLT oslffri_neon_lp1
+ LDMFD r13!,{r0,r4-r11,PC}
+ ENDP
+ ]
+
+ END
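
For reference (this block is not part of the patch): the clamp that the ARMv6 and NEON comment blocks derive, f' = (f>=0) ? MIN(|f|,MAX(2L-|f|,0)) : -MIN(|f|,MAX(2L-|f|,0)), can be sanity-checked with a small scalar C model. The function name lflim_model and the test values are illustrative only; this is not part of libtheora's API.

#include <stdio.h>
#include <stdlib.h>

/*Hypothetical scalar model of the loop-filter limit function derived in
  the comment blocks above; two_l is 2*L from the Theora spec.*/
static int lflim_model(int f,int two_l){
  int m;
  m=two_l-abs(f);        /*2L-|f|*/
  if(m<0)m=0;            /*MAX(2L-|f|,0)*/
  if(m>abs(f))m=abs(f);  /*MIN(|f|,MAX(2L-|f|,0))*/
  return f>=0?m:-m;      /*reapply the sign of f*/
}

int main(void){
  int f;
  /*With L=2 (two_l=4): small filter values pass through unchanged,
    larger ones taper off, and values at or beyond 2L are damped to 0.*/
  for(f=-6;f<=6;f++)printf("f=%+d -> %+d\n",f,lflim_model(f,4));
  return 0;
}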