diff options
Diffstat (limited to 'meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch')
| -rw-r--r-- | meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch new file mode 100644 index 0000000000..dc8a69f749 --- /dev/null +++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch | |||
| @@ -0,0 +1,235 @@ | |||
| 1 | From 524d1cc7acb753167fffdd08d8c10bf71e0634ba Mon Sep 17 00:00:00 2001 | ||
| 2 | From: Taekyun Kim <tkq.kim@samsung.com> | ||
| 3 | Date: Tue, 20 Sep 2011 21:32:35 +0900 | ||
| 4 | Subject: [PATCH 4/8] ARM: NEON: Bilinear macro template for instruction scheduling | ||
| 5 | |||
| 6 | This macro template takes 6 code blocks. | ||
| 7 | |||
| 8 | 1. process_last_pixel | ||
| 9 | 2. process_two_pixels | ||
| 10 | 3. process_four_pixels | ||
| 11 | 4. process_pixblock_head | ||
| 12 | 5. process_pixblock_tail | ||
| 13 | 6. process_pixblock_tail_head | ||
| 14 | |||
| 15 | process_last_pixel does not need to update the horizontal weight; this | |
| 16 | is done by the template. The two- and four-pixel code blocks must update | |
| 17 | the horizontal weight themselves. The head/tail/tail_head blocks | |
| 18 | make up the unrolled core loop. You can apply instruction scheduling | |
| 19 | to the tail_head block. | |
| 20 | |||
| 21 | You can also specify the size of the pixel block. Supported sizes are 4 | |
| 22 | and 8. If you want to use a mask, pass the BILINEAR_FLAG_USE_MASK flag | |
| 23 | to the template; you can then use the register MASK. When using the d8~d15 | |
| 24 | registers, pass BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure the | |
| 25 | registers are properly saved on the stack and later restored. | |
| 26 | --- | ||
| 27 | pixman/pixman-arm-neon-asm-bilinear.S | 195 +++++++++++++++++++++++++++++++++ | ||
| 28 | 1 files changed, 195 insertions(+), 0 deletions(-) | ||
| 29 | |||
| 30 | diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S | ||
| 31 | index c5ba929..784e5df 100644 | ||
| 32 | --- a/pixman/pixman-arm-neon-asm-bilinear.S | ||
| 33 | +++ b/pixman/pixman-arm-neon-asm-bilinear.S | ||
| 34 | @@ -773,3 +773,198 @@ generate_bilinear_scanline_func_src_a8_dst \ | ||
| 35 | generate_bilinear_scanline_func_src_a8_dst \ | ||
| 36 | pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \ | ||
| 37 | 8888, 8888, add, 2, 28 | ||
| 38 | + | ||
| 39 | +.set BILINEAR_FLAG_USE_MASK, 1 /* flag bit: the generated function binds MASK to r1 and takes its other arguments one slot later */ | |
| 40 | +.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* flag bit: the generated function saves d8-d15 on entry and restores them before returning */ | |
| 41 | + | ||
| 42 | +/* | ||
| 43 | + * Main template macro for generating NEON optimized bilinear scanline functions. | ||
| 44 | + * | ||
| 45 | + * The bilinear scanline generator macro takes the following arguments: | |
| 46 | + * fname - name of the function to generate | ||
| 47 | + * src_fmt - source color format (8888 or 0565) | ||
| 48 | + * dst_fmt - destination color format (8888 or 0565) | ||
| 49 | + * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes | ||
| 50 | + * process_last_pixel - code block that interpolates one pixel and does not | |
| 51 | + * update horizontal weight | |
| 52 | + * process_two_pixels - code block that interpolates two pixels and updates | |
| 53 | + * horizontal weight | |
| 54 | + * process_four_pixels - code block that interpolates four pixels and updates | |
| 55 | + * horizontal weight | |
| 56 | + * process_pixblock_head - head part of middle loop | ||
| 57 | + * process_pixblock_tail - tail part of middle loop | ||
| 58 | + * process_pixblock_tail_head - tail_head of middle loop | ||
| 59 | + * pixblock_size - number of pixels processed in a single middle loop | ||
| 60 | + * prefetch_distance - prefetch in the source image by that many pixels ahead | ||
| 61 | + */ | ||
| 62 | + | ||
| 63 | +.macro generate_bilinear_scanline_func \ | |
| 64 | + fname, \ | |
| 65 | + src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \ | |
| 66 | + bilinear_process_last_pixel, \ | |
| 67 | + bilinear_process_two_pixels, \ | |
| 68 | + bilinear_process_four_pixels, \ | |
| 69 | + bilinear_process_pixblock_head, \ | |
| 70 | + bilinear_process_pixblock_tail, \ | |
| 71 | + bilinear_process_pixblock_tail_head, \ | |
| 72 | + pixblock_size, \ | |
| 73 | + prefetch_distance, \ | |
| 74 | + flags | |
| 75 | + /* Body layout: argument setup, leading pixels up to destination alignment, unrolled core loop, trailing pixels, restore and return. */ | |
| 76 | +pixman_asm_function fname /* helper macro defined outside this hunk; presumably emits the function label - TODO confirm */ | |
| 77 | +.if pixblock_size == 8 /* only block sizes 8 and 4 are accepted; anything else stops the assembly */ | |
| 78 | +.elseif pixblock_size == 4 | |
| 79 | +.else | |
| 80 | + .error unsupported pixblock size | |
| 81 | +.endif | |
| 82 | + /* Variant without mask: r0-r3 hold OUT, TOP, BOTTOM and WT; WB, X, UX and WIDTH are read from the stack. */ | |
| 83 | +.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 | |
| 84 | + OUT .req r0 | |
| 85 | + TOP .req r1 | |
| 86 | + BOTTOM .req r2 | |
| 87 | + WT .req r3 | |
| 88 | + WB .req r4 | |
| 89 | + X .req r5 | |
| 90 | + UX .req r6 | |
| 91 | + WIDTH .req ip | |
| 92 | + TMP1 .req r3 /* shares r3 with WT */ | |
| 93 | + TMP2 .req r4 /* shares r4 with WB */ | |
| 94 | + PF_OFFS .req r7 | |
| 95 | + TMP3 .req r8 | |
| 96 | + TMP4 .req r9 | |
| 97 | + STRIDE .req r2 /* shares r2 with BOTTOM */ | |
| 98 | + | |
| 99 | + mov ip, sp /* remember sp as it was before the push; the stack arguments are read from there */ | |
| 100 | + push {r4, r5, r6, r7, r8, r9} /* save the registers bound to WB, X, UX, PF_OFFS, TMP3, TMP4 */ | |
| 101 | + mov PF_OFFS, #prefetch_distance /* prefetch distance in pixels, scaled by UX further down */ | |
| 102 | + ldmia ip, {WB, X, UX, WIDTH} /* WIDTH is ip itself, so the pointer is replaced by the loaded value */ | |
| 103 | +.else /* variant with mask: MASK takes r1, so WT also comes from the stack */ | |
| 104 | + OUT .req r0 | |
| 105 | + MASK .req r1 | |
| 106 | + TOP .req r2 | |
| 107 | + BOTTOM .req r3 | |
| 108 | + WT .req r4 | |
| 109 | + WB .req r5 | |
| 110 | + X .req r6 | |
| 111 | + UX .req r7 | |
| 112 | + WIDTH .req ip | |
| 113 | + TMP1 .req r4 /* shares r4 with WT */ | |
| 114 | + TMP2 .req r5 /* shares r5 with WB */ | |
| 115 | + PF_OFFS .req r8 | |
| 116 | + TMP3 .req r9 | |
| 117 | + TMP4 .req r10 | |
| 118 | + STRIDE .req r3 /* shares r3 with BOTTOM */ | |
| 119 | + | |
| 120 | + mov ip, sp /* remember sp as it was before the push; the stack arguments are read from there */ | |
| 121 | + push {r4, r5, r6, r7, r8, r9, r10, ip} /* NOTE(review): ip is presumably pushed only to keep the register count even (8-byte stack alignment) - confirm */ | |
| 122 | + mov PF_OFFS, #prefetch_distance /* prefetch distance in pixels, scaled by UX further down */ | |
| 123 | + ldmia ip, {WT, WB, X, UX, WIDTH} /* five stack arguments in this variant; WIDTH replaces the pointer in ip */ | |
| 124 | +.endif | |
| 125 | + | |
| 126 | + mul PF_OFFS, PF_OFFS, UX /* PF_OFFS = prefetch_distance * UX */ | |
| 127 | + | |
| 128 | +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 | |
| 129 | + vpush {d8-d15} /* saved here, restored by the matching vpop before return */ | |
| 130 | +.endif | |
| 131 | + | |
| 132 | + sub STRIDE, BOTTOM, TOP /* STRIDE = BOTTOM - TOP, written over the register that held BOTTOM */ | |
| 133 | + .unreq BOTTOM /* BOTTOM is gone from here on; only TOP and STRIDE remain */ | |
| 134 | + | |
| 135 | + cmp WIDTH, #0 /* nothing to do when WIDTH <= 0 */ | |
| 136 | + ble 3f | |
| 137 | + | |
| 138 | + vdup.u16 q12, X /* q12 = X in every 16-bit lane */ | |
| 139 | + vdup.u16 q13, UX /* q13 = UX in every 16-bit lane */ | |
| 140 | + vdup.u8 d28, WT /* d28 = WT in every byte lane */ | |
| 141 | + vdup.u8 d29, WB /* d29 = WB in every byte lane */ | |
| 142 | + vadd.u16 d25, d25, d26 /* upper half of q12 (d25) becomes X + UX; lower half stays X */ | |
| 143 | + | |
| 144 | + /* ensure good destination alignment */ | |
| 145 | + cmp WIDTH, #1 | |
| 146 | + blt 0f | |
| 147 | + tst OUT, #(1 << dst_bpp_shift) /* is the address bit worth one destination pixel set? */ | |
| 148 | + beq 0f | |
| 149 | + vshr.u16 q15, q12, #8 /* horizontal weight update for the single pixel is done here, not in the code block */ | |
| 150 | + vadd.u16 q12, q12, q13 /* step q12 by UX */ | |
| 151 | + bilinear_process_last_pixel | |
| 152 | + sub WIDTH, WIDTH, #1 | |
| 153 | +0: | |
| 154 | + vadd.u16 q13, q13, q13 /* q13 = 2 * UX per lane from here on */ | |
| 155 | + vshr.u16 q15, q12, #8 /* q15 = q12 >> 8 per lane; presumably the weights consumed by the pixel code blocks - confirm */ | |
| 156 | + vadd.u16 q12, q12, q13 /* step q12 by the doubled increment */ | |
| 157 | + | |
| 158 | + cmp WIDTH, #2 | |
| 159 | + blt 0f | |
| 160 | + tst OUT, #(1 << (dst_bpp_shift + 1)) /* address bit worth two destination pixels */ | |
| 161 | + beq 0f | |
| 162 | + bilinear_process_two_pixels | |
| 163 | + sub WIDTH, WIDTH, #2 | |
| 164 | +0: | |
| 165 | +.if pixblock_size == 8 /* the four-pixel alignment step exists only for 8-pixel blocks */ | |
| 166 | + cmp WIDTH, #4 | |
| 167 | + blt 0f | |
| 168 | + tst OUT, #(1 << (dst_bpp_shift + 2)) /* address bit worth four destination pixels */ | |
| 169 | + beq 0f | |
| 170 | + bilinear_process_four_pixels | |
| 171 | + sub WIDTH, WIDTH, #4 | |
| 172 | +0: | |
| 173 | +.endif | |
| 174 | + subs WIDTH, WIDTH, #pixblock_size /* WIDTH stays biased by -pixblock_size for the rest of the function */ | |
| 175 | + blt 1f /* fewer than pixblock_size pixels left: skip the core loop */ | |
| 176 | + mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) /* NOTE(review): presumably turns the fixed-point product into a source byte offset - confirm */ | |
| 177 | + bilinear_process_pixblock_head | |
| 178 | + subs WIDTH, WIDTH, #pixblock_size | |
| 179 | + blt 5f /* exactly one full block: go straight to the tail */ | |
| 180 | +0: | |
| 181 | + bilinear_process_pixblock_tail_head /* runs once per further full block, between head and tail */ | |
| 182 | + subs WIDTH, WIDTH, #pixblock_size | |
| 183 | + bge 0b | |
| 184 | +5: | |
| 185 | + bilinear_process_pixblock_tail | |
| 186 | +1: | |
| 187 | +.if pixblock_size == 8 | |
| 188 | + tst WIDTH, #4 /* WIDTH is negative here, but its low bits still equal the remaining pixel count */ | |
| 189 | + beq 2f | |
| 190 | + bilinear_process_four_pixels | |
| 191 | +2: | |
| 192 | +.endif | |
| 193 | + /* handle the remaining trailing pixels */ | |
| 194 | + tst WIDTH, #2 | |
| 195 | + beq 2f | |
| 196 | + bilinear_process_two_pixels | |
| 197 | +2: | |
| 198 | + tst WIDTH, #1 | |
| 199 | + beq 3f | |
| 200 | + bilinear_process_last_pixel | |
| 201 | +3: | |
| 202 | +.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 | |
| 203 | + vpop {d8-d15} /* undo the vpush done after argument setup */ | |
| 204 | +.endif | |
| 205 | + | |
| 206 | +.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 | |
| 207 | + pop {r4, r5, r6, r7, r8, r9} | |
| 208 | +.else | |
| 209 | + pop {r4, r5, r6, r7, r8, r9, r10, ip} /* the value popped into ip is not used afterwards */ | |
| 210 | +.endif | |
| 211 | + bx lr | |
| 212 | + /* Release the aliases: the two variants bind the same names to different registers. BOTTOM was already released above. */ | |
| 213 | + .unreq OUT | |
| 214 | + .unreq TOP | |
| 215 | + .unreq WT | |
| 216 | + .unreq WB | |
| 217 | + .unreq X | |
| 218 | + .unreq UX | |
| 219 | + .unreq WIDTH | |
| 220 | + .unreq TMP1 | |
| 221 | + .unreq TMP2 | |
| 222 | + .unreq PF_OFFS | |
| 223 | + .unreq TMP3 | |
| 224 | + .unreq TMP4 | |
| 225 | + .unreq STRIDE | |
| 226 | +.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 /* MASK exists only in the mask variant */ | |
| 227 | + .unreq MASK | |
| 228 | +.endif | |
| 229 | + | |
| 230 | +.endfunc | |
| 231 | + | |
| 232 | +.endm | |
| 233 | -- | ||
| 234 | 1.6.6.1 | ||
| 235 | |||
