summaryrefslogtreecommitdiffstats
path: root/meta-oe/recipes-graphics
diff options
context:
space:
mode:
Diffstat (limited to 'meta-oe/recipes-graphics')
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0001-ARM-NEON-Standard-fast-path-src_n_8_8888.patch129
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0002-ARM-NEON-Standard-fast-path-src_n_8_8.patch118
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch331
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch235
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0005-ARM-NEON-Replace-old-bilinear-scanline-generator-wit.patch520
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0006-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch186
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch206
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch114
-rw-r--r--meta-oe/recipes-graphics/xorg-lib/pixman_0.23.6.bb29
9 files changed, 1868 insertions, 0 deletions
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0001-ARM-NEON-Standard-fast-path-src_n_8_8888.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0001-ARM-NEON-Standard-fast-path-src_n_8_8888.patch
new file mode 100644
index 000000000..53e9d72cd
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0001-ARM-NEON-Standard-fast-path-src_n_8_8888.patch
@@ -0,0 +1,129 @@
1From f7d1d45e30b59b513d48294de50dc86af60ea68c Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Mon, 26 Sep 2011 17:03:54 +0900
4Subject: [PATCH 1/8] ARM: NEON: Standard fast path src_n_8_8888
5
6Performance numbers of before/after on cortex-a8 @ 1GHz
7
8- before
9L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s)
10
11- after
12L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s)
13---
14 pixman/pixman-arm-neon-asm.S | 73 ++++++++++++++++++++++++++++++++++++++++++
15 pixman/pixman-arm-neon.c | 7 ++++
16 2 files changed, 80 insertions(+), 0 deletions(-)
17
18diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
19index 3fcd07d..1db02db 100644
20--- a/pixman/pixman-arm-neon-asm.S
21+++ b/pixman/pixman-arm-neon-asm.S
22@@ -1219,6 +1219,79 @@ generate_composite_function \
23
24 /******************************************************************************/
25
26+.macro pixman_composite_src_n_8_8888_process_pixblock_head
27+ /* expecting solid source in {d0, d1, d2, d3} */
28+ /* mask is in d24 (d25, d26, d27 are unused) */
29+
30+ /* in */
31+ vmull.u8 q8, d24, d0
32+ vmull.u8 q9, d24, d1
33+ vmull.u8 q10, d24, d2
34+ vmull.u8 q11, d24, d3
35+ vrsra.u16 q8, q8, #8
36+ vrsra.u16 q9, q9, #8
37+ vrsra.u16 q10, q10, #8
38+ vrsra.u16 q11, q11, #8
39+.endm
40+
41+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
42+ vrshrn.u16 d28, q8, #8
43+ vrshrn.u16 d29, q9, #8
44+ vrshrn.u16 d30, q10, #8
45+ vrshrn.u16 d31, q11, #8
46+.endm
47+
48+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
49+ fetch_mask_pixblock
50+ PF add PF_X, PF_X, #8
51+ vrshrn.u16 d28, q8, #8
52+ PF tst PF_CTL, #0x0F
53+ vrshrn.u16 d29, q9, #8
54+ PF addne PF_X, PF_X, #8
55+ vrshrn.u16 d30, q10, #8
56+ PF subne PF_CTL, PF_CTL, #1
57+ vrshrn.u16 d31, q11, #8
58+ PF cmp PF_X, ORIG_W
59+ vmull.u8 q8, d24, d0
60+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
61+ vmull.u8 q9, d24, d1
62+ PF subge PF_X, PF_X, ORIG_W
63+ vmull.u8 q10, d24, d2
64+ PF subges PF_CTL, PF_CTL, #0x10
65+ vmull.u8 q11, d24, d3
66+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
67+ vst4.8 {d28, d29, d30, d31}, [DST_W :128]!
68+ vrsra.u16 q8, q8, #8
69+ vrsra.u16 q9, q9, #8
70+ vrsra.u16 q10, q10, #8
71+ vrsra.u16 q11, q11, #8
72+.endm
73+
74+.macro pixman_composite_src_n_8_8888_init
75+ add DUMMY, sp, #ARGS_STACK_OFFSET
76+ vld1.32 {d3[0]}, [DUMMY]
77+ vdup.8 d0, d3[0]
78+ vdup.8 d1, d3[1]
79+ vdup.8 d2, d3[2]
80+ vdup.8 d3, d3[3]
81+.endm
82+
83+.macro pixman_composite_src_n_8_8888_cleanup
84+.endm
85+
86+generate_composite_function \
87+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
88+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
89+ 8, /* number of pixels, processed in a single block */ \
90+ 5, /* prefetch distance */ \
91+ pixman_composite_src_n_8_8888_init, \
92+ pixman_composite_src_n_8_8888_cleanup, \
93+ pixman_composite_src_n_8_8888_process_pixblock_head, \
94+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
95+ pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
96+
97+/******************************************************************************/
98+
99 .macro pixman_composite_over_n_8_8888_process_pixblock_head
100 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
101 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
102diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
103index effb50b..3db9adf 100644
104--- a/pixman/pixman-arm-neon.c
105+++ b/pixman/pixman-arm-neon.c
106@@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
107 uint8_t, 1, uint8_t, 1)
108 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
109 uint8_t, 1, uint32_t, 1)
110+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
111+ uint8_t, 1, uint32_t, 1)
112
113 PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
114 uint32_t, 1, uint32_t, 1)
115@@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
116 PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888),
117 PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888),
118 PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888),
119+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888),
120+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
121+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
122+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
123+
124 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
125 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
126 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
127--
1281.6.6.1
129
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0002-ARM-NEON-Standard-fast-path-src_n_8_8.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0002-ARM-NEON-Standard-fast-path-src_n_8_8.patch
new file mode 100644
index 000000000..d0a4b4546
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0002-ARM-NEON-Standard-fast-path-src_n_8_8.patch
@@ -0,0 +1,118 @@
1From fc92ad56c5218157a097f6ed0c06196be9f74906 Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Mon, 26 Sep 2011 18:33:27 +0900
4Subject: [PATCH 2/8] ARM: NEON: Standard fast path src_n_8_8
5
6Performance numbers of before/after on cortex-a8 @ 1GHz
7
8- before
9L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s)
10
11- after
12L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s)
13---
14 pixman/pixman-arm-neon-asm.S | 66 ++++++++++++++++++++++++++++++++++++++++++
15 pixman/pixman-arm-neon.c | 3 ++
16 2 files changed, 69 insertions(+), 0 deletions(-)
17
18diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
19index 1db02db..da8f054 100644
20--- a/pixman/pixman-arm-neon-asm.S
21+++ b/pixman/pixman-arm-neon-asm.S
22@@ -1292,6 +1292,72 @@ generate_composite_function \
23
24 /******************************************************************************/
25
26+.macro pixman_composite_src_n_8_8_process_pixblock_head
27+ vmull.u8 q0, d24, d16
28+ vmull.u8 q1, d25, d16
29+ vmull.u8 q2, d26, d16
30+ vmull.u8 q3, d27, d16
31+ vrsra.u16 q0, q0, #8
32+ vrsra.u16 q1, q1, #8
33+ vrsra.u16 q2, q2, #8
34+ vrsra.u16 q3, q3, #8
35+.endm
36+
37+.macro pixman_composite_src_n_8_8_process_pixblock_tail
38+ vrshrn.u16 d28, q0, #8
39+ vrshrn.u16 d29, q1, #8
40+ vrshrn.u16 d30, q2, #8
41+ vrshrn.u16 d31, q3, #8
42+.endm
43+
44+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
45+ fetch_mask_pixblock
46+ PF add PF_X, PF_X, #8
47+ vrshrn.u16 d28, q0, #8
48+ PF tst PF_CTL, #0x0F
49+ vrshrn.u16 d29, q1, #8
50+ PF addne PF_X, PF_X, #8
51+ vrshrn.u16 d30, q2, #8
52+ PF subne PF_CTL, PF_CTL, #1
53+ vrshrn.u16 d31, q3, #8
54+ PF cmp PF_X, ORIG_W
55+ vmull.u8 q0, d24, d16
56+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
57+ vmull.u8 q1, d25, d16
58+ PF subge PF_X, PF_X, ORIG_W
59+ vmull.u8 q2, d26, d16
60+ PF subges PF_CTL, PF_CTL, #0x10
61+ vmull.u8 q3, d27, d16
62+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
63+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
64+ vrsra.u16 q0, q0, #8
65+ vrsra.u16 q1, q1, #8
66+ vrsra.u16 q2, q2, #8
67+ vrsra.u16 q3, q3, #8
68+.endm
69+
70+.macro pixman_composite_src_n_8_8_init
71+ add DUMMY, sp, #ARGS_STACK_OFFSET
72+ vld1.32 {d16[0]}, [DUMMY]
73+ vdup.8 d16, d16[3]
74+.endm
75+
76+.macro pixman_composite_src_n_8_8_cleanup
77+.endm
78+
79+generate_composite_function \
80+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
81+ FLAG_DST_WRITEONLY, \
82+ 32, /* number of pixels, processed in a single block */ \
83+ 5, /* prefetch distance */ \
84+ pixman_composite_src_n_8_8_init, \
85+ pixman_composite_src_n_8_8_cleanup, \
86+ pixman_composite_src_n_8_8_process_pixblock_head, \
87+ pixman_composite_src_n_8_8_process_pixblock_tail, \
88+ pixman_composite_src_n_8_8_process_pixblock_tail_head
89+
90+/******************************************************************************/
91+
92 .macro pixman_composite_over_n_8_8888_process_pixblock_head
93 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
94 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
95diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
96index 3db9adf..ca139de 100644
97--- a/pixman/pixman-arm-neon.c
98+++ b/pixman/pixman-arm-neon.c
99@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
100 uint8_t, 1, uint32_t, 1)
101 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
102 uint8_t, 1, uint32_t, 1)
103+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
104+ uint8_t, 1, uint8_t, 1)
105
106 PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
107 uint32_t, 1, uint32_t, 1)
108@@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
109 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
110 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
111 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
112+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8),
113
114 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
115 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
116--
1171.6.6.1
118
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch
new file mode 100644
index 000000000..338e2ad83
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch
@@ -0,0 +1,331 @@
1From ed7580525054e6a543694088c561dee525b4ae28 Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Tue, 20 Sep 2011 19:46:25 +0900
4Subject: [PATCH 3/8] ARM: NEON: Some cleanup of bilinear scanline functions
5
6Use STRIDE, and do the initial horizontal weight update before
7entering the interpolation loop. Add cache preload for mask and dst.
8---
9 pixman/pixman-arm-neon-asm-bilinear.S | 128 +++++++++++++++++----------------
10 1 files changed, 67 insertions(+), 61 deletions(-)
11
12diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
13index 3c7fe0f..c5ba929 100644
14--- a/pixman/pixman-arm-neon-asm-bilinear.S
15+++ b/pixman/pixman-arm-neon-asm-bilinear.S
16@@ -44,10 +44,6 @@
17 * All temp registers can be used freely outside the code block.
18 * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
19 *
20- * TODOs
21- * Support 0565 pixel format
22- * Optimization for two and last pixel cases
23- *
24 * Remarks
25 * There can be lots of pipeline stalls inside code block and between code blocks.
26 * Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
27@@ -92,21 +88,19 @@ fname:
28 */
29
30 .macro bilinear_load_8888 reg1, reg2, tmp
31- mov TMP2, X, asr #16
32+ mov TMP1, X, asr #16
33 add X, X, UX
34- add TMP1, TOP, TMP2, asl #2
35- add TMP2, BOTTOM, TMP2, asl #2
36- vld1.32 {reg1}, [TMP1]
37- vld1.32 {reg2}, [TMP2]
38+ add TMP1, TOP, TMP1, asl #2
39+ vld1.32 {reg1}, [TMP1], STRIDE
40+ vld1.32 {reg2}, [TMP1]
41 .endm
42
43 .macro bilinear_load_0565 reg1, reg2, tmp
44- mov TMP2, X, asr #16
45+ mov TMP1, X, asr #16
46 add X, X, UX
47- add TMP1, TOP, TMP2, asl #1
48- add TMP2, BOTTOM, TMP2, asl #1
49- vld1.32 {reg2[0]}, [TMP1]
50- vld1.32 {reg2[1]}, [TMP2]
51+ add TMP1, TOP, TMP1, asl #1
52+ vld1.32 {reg2[0]}, [TMP1], STRIDE
53+ vld1.32 {reg2[1]}, [TMP1]
54 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
55 .endm
56
57@@ -134,18 +128,16 @@ fname:
58 .macro bilinear_load_and_vertical_interpolate_two_0565 \
59 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
60
61- mov TMP2, X, asr #16
62+ mov TMP1, X, asr #16
63 add X, X, UX
64- mov TMP4, X, asr #16
65+ add TMP1, TOP, TMP1, asl #1
66+ mov TMP2, X, asr #16
67 add X, X, UX
68- add TMP1, TOP, TMP2, asl #1
69- add TMP2, BOTTOM, TMP2, asl #1
70- add TMP3, TOP, TMP4, asl #1
71- add TMP4, BOTTOM, TMP4, asl #1
72- vld1.32 {acc2lo[0]}, [TMP1]
73- vld1.32 {acc2hi[0]}, [TMP3]
74- vld1.32 {acc2lo[1]}, [TMP2]
75- vld1.32 {acc2hi[1]}, [TMP4]
76+ add TMP2, TOP, TMP2, asl #1
77+ vld1.32 {acc2lo[0]}, [TMP1], STRIDE
78+ vld1.32 {acc2hi[0]}, [TMP2], STRIDE
79+ vld1.32 {acc2lo[1]}, [TMP1]
80+ vld1.32 {acc2hi[1]}, [TMP2]
81 convert_0565_to_x888 acc2, reg3, reg2, reg1
82 vzip.u8 reg1, reg3
83 vzip.u8 reg2, reg4
84@@ -161,34 +153,30 @@ fname:
85 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
86 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
87
88- mov TMP2, X, asr #16
89+ mov TMP1, X, asr #16
90 add X, X, UX
91- mov TMP4, X, asr #16
92+ add TMP1, TOP, TMP1, asl #1
93+ mov TMP2, X, asr #16
94 add X, X, UX
95- add TMP1, TOP, TMP2, asl #1
96- add TMP2, BOTTOM, TMP2, asl #1
97- add TMP3, TOP, TMP4, asl #1
98- add TMP4, BOTTOM, TMP4, asl #1
99- vld1.32 {xacc2lo[0]}, [TMP1]
100- vld1.32 {xacc2hi[0]}, [TMP3]
101- vld1.32 {xacc2lo[1]}, [TMP2]
102- vld1.32 {xacc2hi[1]}, [TMP4]
103+ add TMP2, TOP, TMP2, asl #1
104+ vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
105+ vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
106+ vld1.32 {xacc2lo[1]}, [TMP1]
107+ vld1.32 {xacc2hi[1]}, [TMP2]
108 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
109- mov TMP2, X, asr #16
110+ mov TMP1, X, asr #16
111 add X, X, UX
112- mov TMP4, X, asr #16
113+ add TMP1, TOP, TMP1, asl #1
114+ mov TMP2, X, asr #16
115 add X, X, UX
116- add TMP1, TOP, TMP2, asl #1
117- add TMP2, BOTTOM, TMP2, asl #1
118- add TMP3, TOP, TMP4, asl #1
119- add TMP4, BOTTOM, TMP4, asl #1
120- vld1.32 {yacc2lo[0]}, [TMP1]
121+ add TMP2, TOP, TMP2, asl #1
122+ vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
123 vzip.u8 xreg1, xreg3
124- vld1.32 {yacc2hi[0]}, [TMP3]
125+ vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
126 vzip.u8 xreg2, xreg4
127- vld1.32 {yacc2lo[1]}, [TMP2]
128+ vld1.32 {yacc2lo[1]}, [TMP1]
129 vzip.u8 xreg3, xreg4
130- vld1.32 {yacc2hi[1]}, [TMP4]
131+ vld1.32 {yacc2hi[1]}, [TMP2]
132 vzip.u8 xreg1, xreg2
133 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
134 vmull.u8 xacc1, xreg1, d28
135@@ -252,6 +240,7 @@ fname:
136 .else
137 .error bilinear_load_mask_8 numpix is unsupported
138 .endif
139+ pld [MASK, #prefetch_offset]
140 .endm
141
142 .macro bilinear_load_mask mask_fmt, numpix, mask
143@@ -279,6 +268,7 @@ fname:
144 .else
145 .error bilinear_load_dst_8888 numpix is unsupported
146 .endif
147+ pld [OUT, #(prefetch_offset * 4)]
148 .endm
149
150 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
151@@ -303,7 +293,7 @@ fname:
152 * For two pixel case
153 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
154 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
155- * We can do some optimizations for this including one pixel cases.
156+ * We can do some optimizations for this including last pixel cases.
157 */
158 .macro bilinear_duplicate_mask_x numpix, mask
159 .endm
160@@ -497,8 +487,7 @@ fname:
161 bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
162 vmull.u8 q1, d0, d28
163 vmlal.u8 q1, d1, d29
164- vshr.u16 d30, d24, #8
165- /* 4 cycles bubble */
166+ /* 5 cycles bubble */
167 vshll.u16 q0, d2, #8
168 vmlsl.u16 q0, d2, d30
169 vmlal.u16 q0, d3, d30
170@@ -525,18 +514,18 @@ fname:
171 q1, q11, d0, d1, d20, d21, d22, d23
172 bilinear_load_mask mask_fmt, 2, d4
173 bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
174- vshr.u16 q15, q12, #8
175- vadd.u16 q12, q12, q13
176 vshll.u16 q0, d2, #8
177 vmlsl.u16 q0, d2, d30
178 vmlal.u16 q0, d3, d30
179 vshll.u16 q10, d22, #8
180 vmlsl.u16 q10, d22, d31
181 vmlal.u16 q10, d23, d31
182- vshrn.u32 d30, q0, #16
183- vshrn.u32 d31, q10, #16
184+ vshrn.u32 d0, q0, #16
185+ vshrn.u32 d1, q10, #16
186 bilinear_duplicate_mask mask_fmt, 2, d4
187- vmovn.u16 d0, q15
188+ vshr.u16 q15, q12, #8
189+ vadd.u16 q12, q12, q13
190+ vmovn.u16 d0, q0
191 bilinear_interleave_src_dst \
192 mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
193 bilinear_apply_mask_to_src \
194@@ -554,8 +543,7 @@ fname:
195 q1, q11, d0, d1, d20, d21, d22, d23 \
196 q3, q9, d4, d5, d16, d17, d18, d19
197 pld [TMP1, PF_OFFS]
198- vshr.u16 q15, q12, #8
199- vadd.u16 q12, q12, q13
200+ sub TMP1, TMP1, STRIDE
201 vshll.u16 q0, d2, #8
202 vmlsl.u16 q0, d2, d30
203 vmlal.u16 q0, d3, d30
204@@ -567,9 +555,9 @@ fname:
205 vmlsl.u16 q2, d6, d30
206 vmlal.u16 q2, d7, d30
207 vshll.u16 q8, d18, #8
208- bilinear_load_mask mask_fmt, 4, d30
209+ bilinear_load_mask mask_fmt, 4, d22
210 bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
211- pld [TMP2, PF_OFFS]
212+ pld [TMP1, PF_OFFS]
213 vmlsl.u16 q8, d18, d31
214 vmlal.u16 q8, d19, d31
215 vadd.u16 q12, q12, q13
216@@ -577,17 +565,19 @@ fname:
217 vshrn.u32 d1, q10, #16
218 vshrn.u32 d4, q2, #16
219 vshrn.u32 d5, q8, #16
220- bilinear_duplicate_mask mask_fmt, 4, d30
221+ bilinear_duplicate_mask mask_fmt, 4, d22
222+ vshr.u16 q15, q12, #8
223 vmovn.u16 d0, q0
224 vmovn.u16 d1, q2
225+ vadd.u16 q12, q12, q13
226 bilinear_interleave_src_dst \
227 mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
228 bilinear_apply_mask_to_src \
229- mask_fmt, 4, d0, d1, q0, d30, \
230+ mask_fmt, 4, d0, d1, q0, d22, \
231 q3, q8, q9, q10
232 bilinear_combine \
233 op, 4, d0, d1, q0, d2, d3, q1, \
234- q3, q8, q9, q10, d22
235+ q3, q8, q9, q10, d23
236 bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
237 bilinear_store_&dst_fmt 4, q2, q3
238 .endm
239@@ -610,6 +600,7 @@ pixman_asm_function fname
240 PF_OFFS .req r7
241 TMP3 .req r8
242 TMP4 .req r9
243+ STRIDE .req r2
244
245 mov ip, sp
246 push {r4, r5, r6, r7, r8, r9}
247@@ -617,6 +608,11 @@ pixman_asm_function fname
248 ldmia ip, {WB, X, UX, WIDTH}
249 mul PF_OFFS, PF_OFFS, UX
250
251+ .set prefetch_offset, prefetch_distance
252+
253+ sub STRIDE, BOTTOM, TOP
254+ .unreq BOTTOM
255+
256 cmp WIDTH, #0
257 ble 3f
258
259@@ -626,6 +622,8 @@ pixman_asm_function fname
260 vdup.u8 d29, WB
261 vadd.u16 d25, d25, d26
262 vadd.u16 q13, q13, q13
263+ vshr.u16 q15, q12, #8
264+ vadd.u16 q12, q12, q13
265
266 subs WIDTH, WIDTH, #4
267 blt 1f
268@@ -648,7 +646,6 @@ pixman_asm_function fname
269
270 .unreq OUT
271 .unreq TOP
272- .unreq BOTTOM
273 .unreq WT
274 .unreq WB
275 .unreq X
276@@ -659,6 +656,7 @@ pixman_asm_function fname
277 .unreq PF_OFFS
278 .unreq TMP3
279 .unreq TMP4
280+ .unreq STRIDE
281 .endfunc
282
283 .endm
284@@ -682,6 +680,7 @@ pixman_asm_function fname
285 PF_OFFS .req r8
286 TMP3 .req r9
287 TMP4 .req r10
288+ STRIDE .req r3
289
290 mov ip, sp
291 push {r4, r5, r6, r7, r8, r9, r10, ip}
292@@ -689,6 +688,11 @@ pixman_asm_function fname
293 ldmia ip, {WT, WB, X, UX, WIDTH}
294 mul PF_OFFS, PF_OFFS, UX
295
296+ .set prefetch_offset, prefetch_distance
297+
298+ sub STRIDE, BOTTOM, TOP
299+ .unreq BOTTOM
300+
301 cmp WIDTH, #0
302 ble 3f
303
304@@ -698,6 +702,8 @@ pixman_asm_function fname
305 vdup.u8 d29, WB
306 vadd.u16 d25, d25, d26
307 vadd.u16 q13, q13, q13
308+ vshr.u16 q15, q12, #8
309+ vadd.u16 q12, q12, q13
310
311 subs WIDTH, WIDTH, #4
312 blt 1f
313@@ -720,7 +726,6 @@ pixman_asm_function fname
314
315 .unreq OUT
316 .unreq TOP
317- .unreq BOTTOM
318 .unreq WT
319 .unreq WB
320 .unreq X
321@@ -732,6 +737,7 @@ pixman_asm_function fname
322 .unreq PF_OFFS
323 .unreq TMP3
324 .unreq TMP4
325+ .unreq STRIDE
326 .endfunc
327
328 .endm
329--
3301.6.6.1
331
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch
new file mode 100644
index 000000000..dc8a69f74
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch
@@ -0,0 +1,235 @@
1From 524d1cc7acb753167fffdd08d8c10bf71e0634ba Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Tue, 20 Sep 2011 21:32:35 +0900
4Subject: [PATCH 4/8] ARM: NEON: Bilinear macro template for instruction scheduling
5
6This macro template takes 6 code blocks.
7
81. process_last_pixel
92. process_two_pixels
103. process_four_pixels
114. process_pixblock_head
125. process_pixblock_tail
136. process_pixblock_tail_head
14
15process_last_pixel does not need to update the horizontal weight;
16the template does that. The two- and four-pixel code blocks must
17update the horizontal weight themselves. The head/tail/tail_head
18blocks make up the unrolled core loop. You can apply instruction
19scheduling to the tail_head blocks.
20
21You can also specify the size of the pixel block; supported sizes
22are 4 and 8. To use a mask, pass the BILINEAR_FLAG_USE_MASK flag to
23the template; the MASK register is then available. When using the
24d8-d15 registers, pass BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure
25they are properly saved on the stack and later restored.
26---
27 pixman/pixman-arm-neon-asm-bilinear.S | 195 +++++++++++++++++++++++++++++++++
28 1 files changed, 195 insertions(+), 0 deletions(-)
29
30diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
31index c5ba929..784e5df 100644
32--- a/pixman/pixman-arm-neon-asm-bilinear.S
33+++ b/pixman/pixman-arm-neon-asm-bilinear.S
34@@ -773,3 +773,198 @@ generate_bilinear_scanline_func_src_a8_dst \
35 generate_bilinear_scanline_func_src_a8_dst \
36 pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
37 8888, 8888, add, 2, 28
38+
39+.set BILINEAR_FLAG_USE_MASK, 1
40+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
41+
42+/*
43+ * Main template macro for generating NEON optimized bilinear scanline functions.
44+ *
45+ * Bilinear scanline generator macro takes the following arguments:
46+ * fname - name of the function to generate
47+ * src_fmt - source color format (8888 or 0565)
48+ * dst_fmt - destination color format (8888 or 0565)
49+ * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
50+ * process_last_pixel - code block that interpolates one pixel and does not
51+ * update horizontal weight
52+ * process_two_pixels - code block that interpolates two pixels and updates
53+ * horizontal weight
54+ * process_four_pixels - code block that interpolates four pixels and updates
55+ * horizontal weight
56+ * process_pixblock_head - head part of middle loop
57+ * process_pixblock_tail - tail part of middle loop
58+ * process_pixblock_tail_head - tail_head of middle loop
59+ * pixblock_size - number of pixels processed in a single middle loop
60+ * prefetch_distance - prefetch in the source image by that many pixels ahead
61+ */
62+
63+.macro generate_bilinear_scanline_func \
64+ fname, \
65+ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
66+ bilinear_process_last_pixel, \
67+ bilinear_process_two_pixels, \
68+ bilinear_process_four_pixels, \
69+ bilinear_process_pixblock_head, \
70+ bilinear_process_pixblock_tail, \
71+ bilinear_process_pixblock_tail_head, \
72+ pixblock_size, \
73+ prefetch_distance, \
74+ flags
75+
76+pixman_asm_function fname
77+.if pixblock_size == 8
78+.elseif pixblock_size == 4
79+.else
80+ .error unsupported pixblock size
81+.endif
82+
83+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
84+ OUT .req r0
85+ TOP .req r1
86+ BOTTOM .req r2
87+ WT .req r3
88+ WB .req r4
89+ X .req r5
90+ UX .req r6
91+ WIDTH .req ip
92+ TMP1 .req r3
93+ TMP2 .req r4
94+ PF_OFFS .req r7
95+ TMP3 .req r8
96+ TMP4 .req r9
97+ STRIDE .req r2
98+
99+ mov ip, sp
100+ push {r4, r5, r6, r7, r8, r9}
101+ mov PF_OFFS, #prefetch_distance
102+ ldmia ip, {WB, X, UX, WIDTH}
103+.else
104+ OUT .req r0
105+ MASK .req r1
106+ TOP .req r2
107+ BOTTOM .req r3
108+ WT .req r4
109+ WB .req r5
110+ X .req r6
111+ UX .req r7
112+ WIDTH .req ip
113+ TMP1 .req r4
114+ TMP2 .req r5
115+ PF_OFFS .req r8
116+ TMP3 .req r9
117+ TMP4 .req r10
118+ STRIDE .req r3
119+
120+ mov ip, sp
121+ push {r4, r5, r6, r7, r8, r9, r10, ip}
122+ mov PF_OFFS, #prefetch_distance
123+ ldmia ip, {WT, WB, X, UX, WIDTH}
124+.endif
125+
126+ mul PF_OFFS, PF_OFFS, UX
127+
128+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
129+ vpush {d8-d15}
130+.endif
131+
132+ sub STRIDE, BOTTOM, TOP
133+ .unreq BOTTOM
134+
135+ cmp WIDTH, #0
136+ ble 3f
137+
138+ vdup.u16 q12, X
139+ vdup.u16 q13, UX
140+ vdup.u8 d28, WT
141+ vdup.u8 d29, WB
142+ vadd.u16 d25, d25, d26
143+
144+ /* ensure good destination alignment */
145+ cmp WIDTH, #1
146+ blt 0f
147+ tst OUT, #(1 << dst_bpp_shift)
148+ beq 0f
149+ vshr.u16 q15, q12, #8
150+ vadd.u16 q12, q12, q13
151+ bilinear_process_last_pixel
152+ sub WIDTH, WIDTH, #1
153+0:
154+ vadd.u16 q13, q13, q13
155+ vshr.u16 q15, q12, #8
156+ vadd.u16 q12, q12, q13
157+
158+ cmp WIDTH, #2
159+ blt 0f
160+ tst OUT, #(1 << (dst_bpp_shift + 1))
161+ beq 0f
162+ bilinear_process_two_pixels
163+ sub WIDTH, WIDTH, #2
164+0:
165+.if pixblock_size == 8
166+ cmp WIDTH, #4
167+ blt 0f
168+ tst OUT, #(1 << (dst_bpp_shift + 2))
169+ beq 0f
170+ bilinear_process_four_pixels
171+ sub WIDTH, WIDTH, #4
172+0:
173+.endif
174+ subs WIDTH, WIDTH, #pixblock_size
175+ blt 1f
176+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
177+ bilinear_process_pixblock_head
178+ subs WIDTH, WIDTH, #pixblock_size
179+ blt 5f
180+0:
181+ bilinear_process_pixblock_tail_head
182+ subs WIDTH, WIDTH, #pixblock_size
183+ bge 0b
184+5:
185+ bilinear_process_pixblock_tail
186+1:
187+.if pixblock_size == 8
188+ tst WIDTH, #4
189+ beq 2f
190+ bilinear_process_four_pixels
191+2:
192+.endif
193+ /* handle the remaining trailing pixels */
194+ tst WIDTH, #2
195+ beq 2f
196+ bilinear_process_two_pixels
197+2:
198+ tst WIDTH, #1
199+ beq 3f
200+ bilinear_process_last_pixel
201+3:
202+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
203+ vpop {d8-d15}
204+.endif
205+
206+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
207+ pop {r4, r5, r6, r7, r8, r9}
208+.else
209+ pop {r4, r5, r6, r7, r8, r9, r10, ip}
210+.endif
211+ bx lr
212+
213+ .unreq OUT
214+ .unreq TOP
215+ .unreq WT
216+ .unreq WB
217+ .unreq X
218+ .unreq UX
219+ .unreq WIDTH
220+ .unreq TMP1
221+ .unreq TMP2
222+ .unreq PF_OFFS
223+ .unreq TMP3
224+ .unreq TMP4
225+ .unreq STRIDE
226+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
227+ .unreq MASK
228+.endif
229+
230+.endfunc
231+
232+.endm
233--
2341.6.6.1
235
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0005-ARM-NEON-Replace-old-bilinear-scanline-generator-wit.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0005-ARM-NEON-Replace-old-bilinear-scanline-generator-wit.patch
new file mode 100644
index 000000000..77c43f5f9
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0005-ARM-NEON-Replace-old-bilinear-scanline-generator-wit.patch
@@ -0,0 +1,520 @@
1From 10b257b46f379d9c79483acd55c9a13fff130843 Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Fri, 23 Sep 2011 00:03:22 +0900
4Subject: [PATCH 5/8] ARM: NEON: Replace old bilinear scanline generator with new template
5
6Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
7be replaced with new template just by wrapping existing macros.
8---
9 pixman/pixman-arm-neon-asm-bilinear.S | 484 ++++++++++++++++++++-------------
10 1 files changed, 292 insertions(+), 192 deletions(-)
11
12diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
13index 784e5df..25bcb24 100644
14--- a/pixman/pixman-arm-neon-asm-bilinear.S
15+++ b/pixman/pixman-arm-neon-asm-bilinear.S
16@@ -582,198 +582,6 @@ fname:
17 bilinear_store_&dst_fmt 4, q2, q3
18 .endm
19
20-.macro generate_bilinear_scanline_func_src_dst \
21- fname, src_fmt, dst_fmt, op, \
22- bpp_shift, prefetch_distance
23-
24-pixman_asm_function fname
25- OUT .req r0
26- TOP .req r1
27- BOTTOM .req r2
28- WT .req r3
29- WB .req r4
30- X .req r5
31- UX .req r6
32- WIDTH .req ip
33- TMP1 .req r3
34- TMP2 .req r4
35- PF_OFFS .req r7
36- TMP3 .req r8
37- TMP4 .req r9
38- STRIDE .req r2
39-
40- mov ip, sp
41- push {r4, r5, r6, r7, r8, r9}
42- mov PF_OFFS, #prefetch_distance
43- ldmia ip, {WB, X, UX, WIDTH}
44- mul PF_OFFS, PF_OFFS, UX
45-
46- .set prefetch_offset, prefetch_distance
47-
48- sub STRIDE, BOTTOM, TOP
49- .unreq BOTTOM
50-
51- cmp WIDTH, #0
52- ble 3f
53-
54- vdup.u16 q12, X
55- vdup.u16 q13, UX
56- vdup.u8 d28, WT
57- vdup.u8 d29, WB
58- vadd.u16 d25, d25, d26
59- vadd.u16 q13, q13, q13
60- vshr.u16 q15, q12, #8
61- vadd.u16 q12, q12, q13
62-
63- subs WIDTH, WIDTH, #4
64- blt 1f
65- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
66-0:
67- bilinear_interpolate_four_pixels src_fmt, x, dst_fmt, op
68- subs WIDTH, WIDTH, #4
69- bge 0b
70-1:
71- tst WIDTH, #2
72- beq 2f
73- bilinear_interpolate_two_pixels src_fmt, x, dst_fmt, op
74-2:
75- tst WIDTH, #1
76- beq 3f
77- bilinear_interpolate_last_pixel src_fmt, x, dst_fmt, op
78-3:
79- pop {r4, r5, r6, r7, r8, r9}
80- bx lr
81-
82- .unreq OUT
83- .unreq TOP
84- .unreq WT
85- .unreq WB
86- .unreq X
87- .unreq UX
88- .unreq WIDTH
89- .unreq TMP1
90- .unreq TMP2
91- .unreq PF_OFFS
92- .unreq TMP3
93- .unreq TMP4
94- .unreq STRIDE
95-.endfunc
96-
97-.endm
98-
99-.macro generate_bilinear_scanline_func_src_a8_dst \
100- fname, src_fmt, dst_fmt, op, \
101- bpp_shift, prefetch_distance
102-
103-pixman_asm_function fname
104- OUT .req r0
105- MASK .req r1
106- TOP .req r2
107- BOTTOM .req r3
108- WT .req r4
109- WB .req r5
110- X .req r6
111- UX .req r7
112- WIDTH .req ip
113- TMP1 .req r4
114- TMP2 .req r5
115- PF_OFFS .req r8
116- TMP3 .req r9
117- TMP4 .req r10
118- STRIDE .req r3
119-
120- mov ip, sp
121- push {r4, r5, r6, r7, r8, r9, r10, ip}
122- mov PF_OFFS, #prefetch_distance
123- ldmia ip, {WT, WB, X, UX, WIDTH}
124- mul PF_OFFS, PF_OFFS, UX
125-
126- .set prefetch_offset, prefetch_distance
127-
128- sub STRIDE, BOTTOM, TOP
129- .unreq BOTTOM
130-
131- cmp WIDTH, #0
132- ble 3f
133-
134- vdup.u16 q12, X
135- vdup.u16 q13, UX
136- vdup.u8 d28, WT
137- vdup.u8 d29, WB
138- vadd.u16 d25, d25, d26
139- vadd.u16 q13, q13, q13
140- vshr.u16 q15, q12, #8
141- vadd.u16 q12, q12, q13
142-
143- subs WIDTH, WIDTH, #4
144- blt 1f
145- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
146-0:
147- bilinear_interpolate_four_pixels src_fmt, 8, dst_fmt, op
148- subs WIDTH, WIDTH, #4
149- bge 0b
150-1:
151- tst WIDTH, #2
152- beq 2f
153- bilinear_interpolate_two_pixels src_fmt, 8, dst_fmt, op
154-2:
155- tst WIDTH, #1
156- beq 3f
157- bilinear_interpolate_last_pixel src_fmt, 8, dst_fmt, op
158-3:
159- pop {r4, r5, r6, r7, r8, r9, r10, ip}
160- bx lr
161-
162- .unreq OUT
163- .unreq TOP
164- .unreq WT
165- .unreq WB
166- .unreq X
167- .unreq UX
168- .unreq WIDTH
169- .unreq MASK
170- .unreq TMP1
171- .unreq TMP2
172- .unreq PF_OFFS
173- .unreq TMP3
174- .unreq TMP4
175- .unreq STRIDE
176-.endfunc
177-
178-.endm
179-
180-generate_bilinear_scanline_func_src_dst \
181- pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
182- 8888, 8888, over, 2, 28
183-
184-generate_bilinear_scanline_func_src_dst \
185- pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
186- 8888, 8888, add, 2, 28
187-
188-generate_bilinear_scanline_func_src_a8_dst \
189- pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
190- 8888, 8888, src, 2, 28
191-
192-generate_bilinear_scanline_func_src_a8_dst \
193- pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
194- 8888, 0565, src, 2, 28
195-
196-generate_bilinear_scanline_func_src_a8_dst \
197- pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
198- 0565, 8888, src, 1, 28
199-
200-generate_bilinear_scanline_func_src_a8_dst \
201- pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
202- 0565, 0565, src, 1, 28
203-
204-generate_bilinear_scanline_func_src_a8_dst \
205- pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
206- 8888, 8888, over, 2, 28
207-
208-generate_bilinear_scanline_func_src_a8_dst \
209- pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
210- 8888, 8888, add, 2, 28
211-
212 .set BILINEAR_FLAG_USE_MASK, 1
213 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
214
215@@ -855,6 +663,8 @@ pixman_asm_function fname
216 TMP4 .req r10
217 STRIDE .req r3
218
219+ .set prefetch_offset, prefetch_distance
220+
221 mov ip, sp
222 push {r4, r5, r6, r7, r8, r9, r10, ip}
223 mov PF_OFFS, #prefetch_distance
224@@ -968,3 +778,293 @@ pixman_asm_function fname
225 .endfunc
226
227 .endm
228+
229+/* src_8888_8_8888 */
230+.macro bilinear_src_8888_8_8888_process_last_pixel
231+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
232+.endm
233+
234+.macro bilinear_src_8888_8_8888_process_two_pixels
235+ bilinear_interpolate_two_pixels 8888, 8, 8888, src
236+.endm
237+
238+.macro bilinear_src_8888_8_8888_process_four_pixels
239+ bilinear_interpolate_four_pixels 8888, 8, 8888, src
240+.endm
241+
242+.macro bilinear_src_8888_8_8888_process_pixblock_head
243+ bilinear_src_8888_8_8888_process_four_pixels
244+.endm
245+
246+.macro bilinear_src_8888_8_8888_process_pixblock_tail
247+.endm
248+
249+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
250+ bilinear_src_8888_8_8888_process_pixblock_tail
251+ bilinear_src_8888_8_8888_process_pixblock_head
252+.endm
253+
254+/* src_8888_8_0565 */
255+.macro bilinear_src_8888_8_0565_process_last_pixel
256+ bilinear_interpolate_last_pixel 8888, 8, 0565, src
257+.endm
258+
259+.macro bilinear_src_8888_8_0565_process_two_pixels
260+ bilinear_interpolate_two_pixels 8888, 8, 0565, src
261+.endm
262+
263+.macro bilinear_src_8888_8_0565_process_four_pixels
264+ bilinear_interpolate_four_pixels 8888, 8, 0565, src
265+.endm
266+
267+.macro bilinear_src_8888_8_0565_process_pixblock_head
268+ bilinear_src_8888_8_0565_process_four_pixels
269+.endm
270+
271+.macro bilinear_src_8888_8_0565_process_pixblock_tail
272+.endm
273+
274+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
275+ bilinear_src_8888_8_0565_process_pixblock_tail
276+ bilinear_src_8888_8_0565_process_pixblock_head
277+.endm
278+
279+/* src_0565_8_x888 */
280+.macro bilinear_src_0565_8_x888_process_last_pixel
281+ bilinear_interpolate_last_pixel 0565, 8, 8888, src
282+.endm
283+
284+.macro bilinear_src_0565_8_x888_process_two_pixels
285+ bilinear_interpolate_two_pixels 0565, 8, 8888, src
286+.endm
287+
288+.macro bilinear_src_0565_8_x888_process_four_pixels
289+ bilinear_interpolate_four_pixels 0565, 8, 8888, src
290+.endm
291+
292+.macro bilinear_src_0565_8_x888_process_pixblock_head
293+ bilinear_src_0565_8_x888_process_four_pixels
294+.endm
295+
296+.macro bilinear_src_0565_8_x888_process_pixblock_tail
297+.endm
298+
299+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
300+ bilinear_src_0565_8_x888_process_pixblock_tail
301+ bilinear_src_0565_8_x888_process_pixblock_head
302+.endm
303+
304+/* src_0565_8_0565 */
305+.macro bilinear_src_0565_8_0565_process_last_pixel
306+ bilinear_interpolate_last_pixel 0565, 8, 0565, src
307+.endm
308+
309+.macro bilinear_src_0565_8_0565_process_two_pixels
310+ bilinear_interpolate_two_pixels 0565, 8, 0565, src
311+.endm
312+
313+.macro bilinear_src_0565_8_0565_process_four_pixels
314+ bilinear_interpolate_four_pixels 0565, 8, 0565, src
315+.endm
316+
317+.macro bilinear_src_0565_8_0565_process_pixblock_head
318+ bilinear_src_0565_8_0565_process_four_pixels
319+.endm
320+
321+.macro bilinear_src_0565_8_0565_process_pixblock_tail
322+.endm
323+
324+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
325+ bilinear_src_0565_8_0565_process_pixblock_tail
326+ bilinear_src_0565_8_0565_process_pixblock_head
327+.endm
328+
329+/* over_8888_8888 */
330+.macro bilinear_over_8888_8888_process_last_pixel
331+ bilinear_interpolate_last_pixel 8888, x, 8888, over
332+.endm
333+
334+.macro bilinear_over_8888_8888_process_two_pixels
335+ bilinear_interpolate_two_pixels 8888, x, 8888, over
336+.endm
337+
338+.macro bilinear_over_8888_8888_process_four_pixels
339+ bilinear_interpolate_four_pixels 8888, x, 8888, over
340+.endm
341+
342+.macro bilinear_over_8888_8888_process_pixblock_head
343+ bilinear_over_8888_8888_process_four_pixels
344+.endm
345+
346+.macro bilinear_over_8888_8888_process_pixblock_tail
347+.endm
348+
349+.macro bilinear_over_8888_8888_process_pixblock_tail_head
350+ bilinear_over_8888_8888_process_pixblock_tail
351+ bilinear_over_8888_8888_process_pixblock_head
352+.endm
353+
354+/* over_8888_8_8888 */
355+.macro bilinear_over_8888_8_8888_process_last_pixel
356+ bilinear_interpolate_last_pixel 8888, 8, 8888, over
357+.endm
358+
359+.macro bilinear_over_8888_8_8888_process_two_pixels
360+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
361+.endm
362+
363+.macro bilinear_over_8888_8_8888_process_four_pixels
364+ bilinear_interpolate_four_pixels 8888, 8, 8888, over
365+.endm
366+
367+.macro bilinear_over_8888_8_8888_process_pixblock_head
368+ bilinear_over_8888_8_8888_process_four_pixels
369+.endm
370+
371+.macro bilinear_over_8888_8_8888_process_pixblock_tail
372+.endm
373+
374+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
375+ bilinear_over_8888_8_8888_process_pixblock_tail
376+ bilinear_over_8888_8_8888_process_pixblock_head
377+.endm
378+
379+/* add_8888_8888 */
380+.macro bilinear_add_8888_8888_process_last_pixel
381+ bilinear_interpolate_last_pixel 8888, x, 8888, add
382+.endm
383+
384+.macro bilinear_add_8888_8888_process_two_pixels
385+ bilinear_interpolate_two_pixels 8888, x, 8888, add
386+.endm
387+
388+.macro bilinear_add_8888_8888_process_four_pixels
389+ bilinear_interpolate_four_pixels 8888, x, 8888, add
390+.endm
391+
392+.macro bilinear_add_8888_8888_process_pixblock_head
393+ bilinear_add_8888_8888_process_four_pixels
394+.endm
395+
396+.macro bilinear_add_8888_8888_process_pixblock_tail
397+.endm
398+
399+.macro bilinear_add_8888_8888_process_pixblock_tail_head
400+ bilinear_add_8888_8888_process_pixblock_tail
401+ bilinear_add_8888_8888_process_pixblock_head
402+.endm
403+
404+/* add_8888_8_8888 */
405+.macro bilinear_add_8888_8_8888_process_last_pixel
406+ bilinear_interpolate_last_pixel 8888, 8, 8888, add
407+.endm
408+
409+.macro bilinear_add_8888_8_8888_process_two_pixels
410+ bilinear_interpolate_two_pixels 8888, 8, 8888, add
411+.endm
412+
413+.macro bilinear_add_8888_8_8888_process_four_pixels
414+ bilinear_interpolate_four_pixels 8888, 8, 8888, add
415+.endm
416+
417+.macro bilinear_add_8888_8_8888_process_pixblock_head
418+ bilinear_add_8888_8_8888_process_four_pixels
419+.endm
420+
421+.macro bilinear_add_8888_8_8888_process_pixblock_tail
422+.endm
423+
424+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
425+ bilinear_add_8888_8_8888_process_pixblock_tail
426+ bilinear_add_8888_8_8888_process_pixblock_head
427+.endm
428+
429+
430+/* Bilinear scanline functions */
431+generate_bilinear_scanline_func \
432+ pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
433+ 8888, 8888, 2, 2, \
434+ bilinear_src_8888_8_8888_process_last_pixel, \
435+ bilinear_src_8888_8_8888_process_two_pixels, \
436+ bilinear_src_8888_8_8888_process_four_pixels, \
437+ bilinear_src_8888_8_8888_process_pixblock_head, \
438+ bilinear_src_8888_8_8888_process_pixblock_tail, \
439+ bilinear_src_8888_8_8888_process_pixblock_tail_head, \
440+ 4, 28, BILINEAR_FLAG_USE_MASK
441+
442+generate_bilinear_scanline_func \
443+ pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
444+ 8888, 0565, 2, 1, \
445+ bilinear_src_8888_8_0565_process_last_pixel, \
446+ bilinear_src_8888_8_0565_process_two_pixels, \
447+ bilinear_src_8888_8_0565_process_four_pixels, \
448+ bilinear_src_8888_8_0565_process_pixblock_head, \
449+ bilinear_src_8888_8_0565_process_pixblock_tail, \
450+ bilinear_src_8888_8_0565_process_pixblock_tail_head, \
451+ 4, 28, BILINEAR_FLAG_USE_MASK
452+
453+generate_bilinear_scanline_func \
454+ pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
455+ 0565, 8888, 1, 2, \
456+ bilinear_src_0565_8_x888_process_last_pixel, \
457+ bilinear_src_0565_8_x888_process_two_pixels, \
458+ bilinear_src_0565_8_x888_process_four_pixels, \
459+ bilinear_src_0565_8_x888_process_pixblock_head, \
460+ bilinear_src_0565_8_x888_process_pixblock_tail, \
461+ bilinear_src_0565_8_x888_process_pixblock_tail_head, \
462+ 4, 28, BILINEAR_FLAG_USE_MASK
463+
464+generate_bilinear_scanline_func \
465+ pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
466+ 0565, 0565, 1, 1, \
467+ bilinear_src_0565_8_0565_process_last_pixel, \
468+ bilinear_src_0565_8_0565_process_two_pixels, \
469+ bilinear_src_0565_8_0565_process_four_pixels, \
470+ bilinear_src_0565_8_0565_process_pixblock_head, \
471+ bilinear_src_0565_8_0565_process_pixblock_tail, \
472+ bilinear_src_0565_8_0565_process_pixblock_tail_head, \
473+ 4, 28, BILINEAR_FLAG_USE_MASK
474+
475+generate_bilinear_scanline_func \
476+ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
477+ 8888, 8888, 2, 2, \
478+ bilinear_over_8888_8888_process_last_pixel, \
479+ bilinear_over_8888_8888_process_two_pixels, \
480+ bilinear_over_8888_8888_process_four_pixels, \
481+ bilinear_over_8888_8888_process_pixblock_head, \
482+ bilinear_over_8888_8888_process_pixblock_tail, \
483+ bilinear_over_8888_8888_process_pixblock_tail_head, \
484+ 4, 28, 0
485+
486+generate_bilinear_scanline_func \
487+ pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
488+ 8888, 8888, 2, 2, \
489+ bilinear_over_8888_8_8888_process_last_pixel, \
490+ bilinear_over_8888_8_8888_process_two_pixels, \
491+ bilinear_over_8888_8_8888_process_four_pixels, \
492+ bilinear_over_8888_8_8888_process_pixblock_head, \
493+ bilinear_over_8888_8_8888_process_pixblock_tail, \
494+ bilinear_over_8888_8_8888_process_pixblock_tail_head, \
495+ 4, 28, BILINEAR_FLAG_USE_MASK
496+
497+generate_bilinear_scanline_func \
498+ pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
499+ 8888, 8888, 2, 2, \
500+ bilinear_add_8888_8888_process_last_pixel, \
501+ bilinear_add_8888_8888_process_two_pixels, \
502+ bilinear_add_8888_8888_process_four_pixels, \
503+ bilinear_add_8888_8888_process_pixblock_head, \
504+ bilinear_add_8888_8888_process_pixblock_tail, \
505+ bilinear_add_8888_8888_process_pixblock_tail_head, \
506+ 4, 28, 0
507+
508+generate_bilinear_scanline_func \
509+ pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
510+ 8888, 8888, 2, 2, \
511+ bilinear_add_8888_8_8888_process_last_pixel, \
512+ bilinear_add_8888_8_8888_process_two_pixels, \
513+ bilinear_add_8888_8_8888_process_four_pixels, \
514+ bilinear_add_8888_8_8888_process_pixblock_head, \
515+ bilinear_add_8888_8_8888_process_pixblock_tail, \
516+ bilinear_add_8888_8_8888_process_pixblock_tail_head, \
517+ 4, 28, BILINEAR_FLAG_USE_MASK
518--
5191.6.6.1
520
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0006-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0006-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
new file mode 100644
index 000000000..d982b5ba7
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0006-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
@@ -0,0 +1,186 @@
1From c8f7edaebd510ba120d74102a93ad4d202b0e806 Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Wed, 21 Sep 2011 15:52:13 +0900
4Subject: [PATCH 6/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8888
5
6Instructions are reordered to eliminate pipeline stalls and get
7better memory access.
8
9Performance of before/after on cortex-a8 @ 1GHz
10
11<< 2000 x 2000 with scale factor close to 1.x >>
12before : 50.43 Mpix/s
13after : 61.09 Mpix/s
14---
15 pixman/pixman-arm-neon-asm-bilinear.S | 149 ++++++++++++++++++++++++++++++++-
16 1 files changed, 146 insertions(+), 3 deletions(-)
17
18diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
19index 25bcb24..76937e0 100644
20--- a/pixman/pixman-arm-neon-asm-bilinear.S
21+++ b/pixman/pixman-arm-neon-asm-bilinear.S
22@@ -893,15 +893,158 @@ pixman_asm_function fname
23 .endm
24
25 .macro bilinear_over_8888_8888_process_pixblock_head
26- bilinear_over_8888_8888_process_four_pixels
27+ mov TMP1, X, asr #16
28+ add X, X, UX
29+ add TMP1, TOP, TMP1, asl #2
30+ mov TMP2, X, asr #16
31+ add X, X, UX
32+ add TMP2, TOP, TMP2, asl #2
33+
34+ vld1.32 {d22}, [TMP1], STRIDE
35+ vld1.32 {d23}, [TMP1]
36+ mov TMP3, X, asr #16
37+ add X, X, UX
38+ add TMP3, TOP, TMP3, asl #2
39+ vmull.u8 q8, d22, d28
40+ vmlal.u8 q8, d23, d29
41+
42+ vld1.32 {d22}, [TMP2], STRIDE
43+ vld1.32 {d23}, [TMP2]
44+ mov TMP4, X, asr #16
45+ add X, X, UX
46+ add TMP4, TOP, TMP4, asl #2
47+ vmull.u8 q9, d22, d28
48+ vmlal.u8 q9, d23, d29
49+
50+ vld1.32 {d22}, [TMP3], STRIDE
51+ vld1.32 {d23}, [TMP3]
52+ vmull.u8 q10, d22, d28
53+ vmlal.u8 q10, d23, d29
54+
55+ vshll.u16 q0, d16, #8
56+ vmlsl.u16 q0, d16, d30
57+ vmlal.u16 q0, d17, d30
58+
59+ pld [TMP4, PF_OFFS]
60+ vld1.32 {d16}, [TMP4], STRIDE
61+ vld1.32 {d17}, [TMP4]
62+ pld [TMP4, PF_OFFS]
63+ vmull.u8 q11, d16, d28
64+ vmlal.u8 q11, d17, d29
65+
66+ vshll.u16 q1, d18, #8
67+ vmlsl.u16 q1, d18, d31
68+ vmlal.u16 q1, d19, d31
69+ vshr.u16 q15, q12, #8
70+ vadd.u16 q12, q12, q13
71 .endm
72
73 .macro bilinear_over_8888_8888_process_pixblock_tail
74+ vshll.u16 q2, d20, #8
75+ vmlsl.u16 q2, d20, d30
76+ vmlal.u16 q2, d21, d30
77+ vshll.u16 q3, d22, #8
78+ vmlsl.u16 q3, d22, d31
79+ vmlal.u16 q3, d23, d31
80+ vshrn.u32 d0, q0, #16
81+ vshrn.u32 d1, q1, #16
82+ vld1.32 {d2, d3}, [OUT, :128]
83+ pld [OUT, PF_OFFS]
84+ vshrn.u32 d4, q2, #16
85+ vshr.u16 q15, q12, #8
86+ vshrn.u32 d5, q3, #16
87+ vmovn.u16 d6, q0
88+ vmovn.u16 d7, q2
89+ vuzp.8 d6, d7
90+ vuzp.8 d2, d3
91+ vuzp.8 d6, d7
92+ vuzp.8 d2, d3
93+ vdup.32 d4, d7[1]
94+ vmvn.8 d4, d4
95+ vmull.u8 q11, d2, d4
96+ vmull.u8 q2, d3, d4
97+ vrshr.u16 q1, q11, #8
98+ vrshr.u16 q10, q2, #8
99+ vraddhn.u16 d2, q1, q11
100+ vraddhn.u16 d3, q10, q2
101+ vqadd.u8 q3, q1, q3
102+ vuzp.8 d6, d7
103+ vuzp.8 d6, d7
104+ vadd.u16 q12, q12, q13
105+ vst1.32 {d6, d7}, [OUT, :128]!
106 .endm
107
108 .macro bilinear_over_8888_8888_process_pixblock_tail_head
109- bilinear_over_8888_8888_process_pixblock_tail
110- bilinear_over_8888_8888_process_pixblock_head
111+ vshll.u16 q2, d20, #8
112+ mov TMP1, X, asr #16
113+ add X, X, UX
114+ add TMP1, TOP, TMP1, asl #2
115+ vmlsl.u16 q2, d20, d30
116+ mov TMP2, X, asr #16
117+ add X, X, UX
118+ add TMP2, TOP, TMP2, asl #2
119+ vmlal.u16 q2, d21, d30
120+ vshll.u16 q3, d22, #8
121+ vld1.32 {d20}, [TMP1], STRIDE
122+ vmlsl.u16 q3, d22, d31
123+ vmlal.u16 q3, d23, d31
124+ vld1.32 {d21}, [TMP1]
125+ vmull.u8 q8, d20, d28
126+ vmlal.u8 q8, d21, d29
127+ vshrn.u32 d0, q0, #16
128+ vshrn.u32 d1, q1, #16
129+ vld1.32 {d2, d3}, [OUT, :128]
130+ pld [OUT, #(prefetch_offset * 4)] /* dst prefetch: fixed distance, 4 bytes per 8888 pixel; PF_OFFS is the source-side offset */
131+ vshrn.u32 d4, q2, #16
132+ vshr.u16 q15, q12, #8
133+ vld1.32 {d22}, [TMP2], STRIDE
134+ vshrn.u32 d5, q3, #16
135+ vmovn.u16 d6, q0
136+ vld1.32 {d23}, [TMP2]
137+ vmull.u8 q9, d22, d28
138+ mov TMP3, X, asr #16
139+ add X, X, UX
140+ add TMP3, TOP, TMP3, asl #2
141+ mov TMP4, X, asr #16
142+ add X, X, UX
143+ add TMP4, TOP, TMP4, asl #2
144+ vmlal.u8 q9, d23, d29
145+ vmovn.u16 d7, q2
146+ vld1.32 {d22}, [TMP3], STRIDE
147+ vuzp.8 d6, d7
148+ vuzp.8 d2, d3
149+ vuzp.8 d6, d7
150+ vuzp.8 d2, d3
151+ vdup.32 d4, d7[1]
152+ vld1.32 {d23}, [TMP3]
153+ vmvn.8 d4, d4
154+ vmull.u8 q10, d22, d28
155+ vmlal.u8 q10, d23, d29
156+ vmull.u8 q11, d2, d4
157+ vmull.u8 q2, d3, d4
158+ vshll.u16 q0, d16, #8
159+ vmlsl.u16 q0, d16, d30
160+ vrshr.u16 q1, q11, #8
161+ vmlal.u16 q0, d17, d30
162+ vrshr.u16 q8, q2, #8
163+ vraddhn.u16 d2, q1, q11
164+ vraddhn.u16 d3, q8, q2
165+ pld [TMP4, PF_OFFS]
166+ vld1.32 {d16}, [TMP4], STRIDE
167+ vqadd.u8 q3, q1, q3
168+ vld1.32 {d17}, [TMP4]
169+ pld [TMP4, PF_OFFS]
170+ vmull.u8 q11, d16, d28
171+ vmlal.u8 q11, d17, d29
172+ vuzp.8 d6, d7
173+ vshll.u16 q1, d18, #8
174+ vuzp.8 d6, d7
175+ vmlsl.u16 q1, d18, d31
176+ vadd.u16 q12, q12, q13
177+ vmlal.u16 q1, d19, d31
178+ vshr.u16 q15, q12, #8
179+ vadd.u16 q12, q12, q13
180+ vst1.32 {d6, d7}, [OUT, :128]!
181 .endm
182
183 /* over_8888_8_8888 */
184--
1851.6.6.1
186
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
new file mode 100644
index 000000000..e4e741f90
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch
@@ -0,0 +1,206 @@
1From 94585f9a618821a5c06c3a497902579b4a08b05f Mon Sep 17 00:00:00 2001
2From: Taekyun Kim <tkq.kim@samsung.com>
3Date: Mon, 26 Sep 2011 19:04:53 +0900
4Subject: [PATCH 7/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
5
6Instructions are reordered to eliminate pipeline stalls and get
7better memory access.
8
9Performance of before/after on cortex-a8 @ 1GHz
10
11<< 2000 x 2000 with scale factor close to 1.x >>
12before : 40.53 Mpix/s
13after : 50.76 Mpix/s
14---
15 pixman/pixman-arm-neon-asm-bilinear.S | 162 ++++++++++++++++++++++++++++++++-
16 1 files changed, 158 insertions(+), 4 deletions(-)
17
18diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
19index 76937e0..4ab46e1 100644
20--- a/pixman/pixman-arm-neon-asm-bilinear.S
21+++ b/pixman/pixman-arm-neon-asm-bilinear.S
22@@ -949,7 +949,7 @@ pixman_asm_function fname
23 vshrn.u32 d0, q0, #16
24 vshrn.u32 d1, q1, #16
25 vld1.32 {d2, d3}, [OUT, :128]
26- pld [OUT, PF_OFFS]
27+ pld [OUT, #(prefetch_offset * 4)]
28 vshrn.u32 d4, q2, #16
29 vshr.u16 q15, q12, #8
30 vshrn.u32 d5, q3, #16
31@@ -1061,15 +1061,169 @@ pixman_asm_function fname
32 .endm
33
34 .macro bilinear_over_8888_8_8888_process_pixblock_head
35- bilinear_over_8888_8_8888_process_four_pixels
36+ mov TMP1, X, asr #16
37+ add X, X, UX
38+ add TMP1, TOP, TMP1, asl #2
39+ vld1.32 {d0}, [TMP1], STRIDE
40+ mov TMP2, X, asr #16
41+ add X, X, UX
42+ add TMP2, TOP, TMP2, asl #2
43+ vld1.32 {d1}, [TMP1]
44+ mov TMP3, X, asr #16
45+ add X, X, UX
46+ add TMP3, TOP, TMP3, asl #2
47+ vld1.32 {d2}, [TMP2], STRIDE
48+ mov TMP4, X, asr #16
49+ add X, X, UX
50+ add TMP4, TOP, TMP4, asl #2
51+ vld1.32 {d3}, [TMP2]
52+ vmull.u8 q2, d0, d28
53+ vmull.u8 q3, d2, d28
54+ vmlal.u8 q2, d1, d29
55+ vmlal.u8 q3, d3, d29
56+ vshll.u16 q0, d4, #8
57+ vshll.u16 q1, d6, #8
58+ vmlsl.u16 q0, d4, d30
59+ vmlsl.u16 q1, d6, d31
60+ vmlal.u16 q0, d5, d30
61+ vmlal.u16 q1, d7, d31
62+ vshrn.u32 d0, q0, #16
63+ vshrn.u32 d1, q1, #16
64+ vld1.32 {d2}, [TMP3], STRIDE
65+ vld1.32 {d3}, [TMP3]
66+ pld [TMP4, PF_OFFS]
67+ vld1.32 {d4}, [TMP4], STRIDE
68+ vld1.32 {d5}, [TMP4]
69+ pld [TMP4, PF_OFFS]
70+ vmull.u8 q3, d2, d28
71+ vmlal.u8 q3, d3, d29
72+ vmull.u8 q1, d4, d28
73+ vmlal.u8 q1, d5, d29
74+ vshr.u16 q15, q12, #8
75+ vld1.32 {d22[0]}, [MASK]!
76+ pld [MASK, #prefetch_offset]
77+ vadd.u16 q12, q12, q13
78+ vmovn.u16 d16, q0
79 .endm
80
81 .macro bilinear_over_8888_8_8888_process_pixblock_tail
82+ vshll.u16 q9, d6, #8
83+ vshll.u16 q10, d2, #8
84+ vmlsl.u16 q9, d6, d30
85+ vmlsl.u16 q10, d2, d31
86+ vmlal.u16 q9, d7, d30
87+ vmlal.u16 q10, d3, d31
88+ vshr.u16 q15, q12, #8
89+ vadd.u16 q12, q12, q13
90+ vdup.32 d22, d22[0]
91+ vshrn.u32 d18, q9, #16
92+ vshrn.u32 d19, q10, #16
93+ vmovn.u16 d17, q9
94+ vld1.32 {d18, d19}, [OUT, :128]
95+ pld [OUT, #(prefetch_offset * 4)] /* dst prefetch: fixed distance, 4 bytes per 8888 pixel; PF_OFFS is the source-side offset */
96+ vuzp.8 d16, d17
97+ vuzp.8 d18, d19
98+ vuzp.8 d16, d17
99+ vuzp.8 d18, d19
100+ vmull.u8 q10, d16, d22
101+ vmull.u8 q11, d17, d22
102+ vrsra.u16 q10, q10, #8
103+ vrsra.u16 q11, q11, #8
104+ vrshrn.u16 d16, q10, #8
105+ vrshrn.u16 d17, q11, #8
106+ vdup.32 d22, d17[1]
107+ vmvn.8 d22, d22
108+ vmull.u8 q10, d18, d22
109+ vmull.u8 q11, d19, d22
110+ vrshr.u16 q9, q10, #8
111+ vrshr.u16 q0, q11, #8
112+ vraddhn.u16 d18, q9, q10
113+ vraddhn.u16 d19, q0, q11
114+ vqadd.u8 q9, q8, q9
115+ vuzp.8 d18, d19
116+ vuzp.8 d18, d19
117+ vst1.32 {d18, d19}, [OUT, :128]!
118 .endm
119
120 .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
121- bilinear_over_8888_8_8888_process_pixblock_tail
122- bilinear_over_8888_8_8888_process_pixblock_head
123+ vshll.u16 q9, d6, #8
124+ mov TMP1, X, asr #16
125+ add X, X, UX
126+ add TMP1, TOP, TMP1, asl #2
127+ vshll.u16 q10, d2, #8
128+ vld1.32 {d0}, [TMP1], STRIDE
129+ mov TMP2, X, asr #16
130+ add X, X, UX
131+ add TMP2, TOP, TMP2, asl #2
132+ vmlsl.u16 q9, d6, d30
133+ vmlsl.u16 q10, d2, d31
134+ vld1.32 {d1}, [TMP1]
135+ mov TMP3, X, asr #16
136+ add X, X, UX
137+ add TMP3, TOP, TMP3, asl #2
138+ vmlal.u16 q9, d7, d30
139+ vmlal.u16 q10, d3, d31
140+ vld1.32 {d2}, [TMP2], STRIDE
141+ mov TMP4, X, asr #16
142+ add X, X, UX
143+ add TMP4, TOP, TMP4, asl #2
144+ vshr.u16 q15, q12, #8
145+ vadd.u16 q12, q12, q13
146+ vld1.32 {d3}, [TMP2]
147+ vdup.32 d22, d22[0]
148+ vshrn.u32 d18, q9, #16
149+ vshrn.u32 d19, q10, #16
150+ vmull.u8 q2, d0, d28
151+ vmull.u8 q3, d2, d28
152+ vmovn.u16 d17, q9
153+ vld1.32 {d18, d19}, [OUT, :128]
154+ pld [OUT, #(prefetch_offset * 4)]
155+ vmlal.u8 q2, d1, d29
156+ vmlal.u8 q3, d3, d29
157+ vuzp.8 d16, d17
158+ vuzp.8 d18, d19
159+ vshll.u16 q0, d4, #8
160+ vshll.u16 q1, d6, #8
161+ vuzp.8 d16, d17
162+ vuzp.8 d18, d19
163+ vmlsl.u16 q0, d4, d30
164+ vmlsl.u16 q1, d6, d31
165+ vmull.u8 q10, d16, d22
166+ vmull.u8 q11, d17, d22
167+ vmlal.u16 q0, d5, d30
168+ vmlal.u16 q1, d7, d31
169+ vrsra.u16 q10, q10, #8
170+ vrsra.u16 q11, q11, #8
171+ vshrn.u32 d0, q0, #16
172+ vshrn.u32 d1, q1, #16
173+ vrshrn.u16 d16, q10, #8
174+ vrshrn.u16 d17, q11, #8
175+ vld1.32 {d2}, [TMP3], STRIDE
176+ vdup.32 d22, d17[1]
177+ vld1.32 {d3}, [TMP3]
178+ vmvn.8 d22, d22
179+ pld [TMP4, PF_OFFS]
180+ vld1.32 {d4}, [TMP4], STRIDE
181+ vmull.u8 q10, d18, d22
182+ vmull.u8 q11, d19, d22
183+ vld1.32 {d5}, [TMP4]
184+ pld [TMP4, PF_OFFS]
185+ vmull.u8 q3, d2, d28
186+ vrshr.u16 q9, q10, #8
187+ vrshr.u16 q15, q11, #8
188+ vmlal.u8 q3, d3, d29
189+ vmull.u8 q1, d4, d28
190+ vraddhn.u16 d18, q9, q10
191+ vraddhn.u16 d19, q15, q11
192+ vmlal.u8 q1, d5, d29
193+ vshr.u16 q15, q12, #8
194+ vqadd.u8 q9, q8, q9
195+ vld1.32 {d22[0]}, [MASK]!
196+ vuzp.8 d18, d19
197+ vadd.u16 q12, q12, q13
198+ vuzp.8 d18, d19
199+ vmovn.u16 d16, q0
200+ vst1.32 {d18, d19}, [OUT, :128]!
201 .endm
202
203 /* add_8888_8888 */
204--
2051.6.6.1
206
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch
new file mode 100644
index 000000000..903f1f7f1
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman-0.23.6/0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch
@@ -0,0 +1,114 @@
1From d65a08904857d87dcd74b87681c9b94390b76eff Mon Sep 17 00:00:00 2001
2From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
3Date: Tue, 16 Mar 2010 16:55:28 +0100
4Subject: [PATCH 8/8] Generic C implementation of pixman_blt with overlapping support
5
6Uses memcpy/memmove functions to copy pixels, can handle the
7case when both source and destination areas are in the same
8image (this is useful for scrolling).
9
10It is assumed that copying direction is only important when
11using the same image for both source and destination (and
12src_stride == dst_stride). Copying direction is undefined
13for the images with different source and destination stride
14which happen to be in the overlapped areas (but this is an
15unrealistic case anyway).
16---
17 pixman/pixman-general.c | 21 ++++++++++++++++++---
18 pixman/pixman-private.h | 43 +++++++++++++++++++++++++++++++++++++++++++
19 2 files changed, 61 insertions(+), 3 deletions(-)
20
21diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
22index 2ccdfcd..6f7bb34 100644
23--- a/pixman/pixman-general.c
24+++ b/pixman/pixman-general.c
25@@ -227,9 +227,24 @@ general_blt (pixman_implementation_t *imp,
26 int width,
27 int height)
28 {
29- /* We can't blit unless we have sse2 or mmx */
30-
31- return FALSE;
32+ uint8_t *dst_bytes = (uint8_t *)dst_bits;
33+ uint8_t *src_bytes = (uint8_t *)src_bits;
34+ int bpp;
35+
36+ if (src_bpp != dst_bpp || src_bpp & 7)
37+ return FALSE;
38+
39+ bpp = src_bpp >> 3;
40+ width *= bpp;
41+ src_stride *= 4;
42+ dst_stride *= 4;
43+ pixman_blt_helper (src_bytes + src_y * src_stride + src_x * bpp,
44+ dst_bytes + dest_y * dst_stride + dest_x * bpp,
45+ src_stride,
46+ dst_stride,
47+ width,
48+ height);
49+ return TRUE;
50 }
51
52 static pixman_bool_t
53diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
54index cbd48f3..c20d9f0 100644
55--- a/pixman/pixman-private.h
56+++ b/pixman/pixman-private.h
57@@ -10,6 +10,7 @@
58
59 #include "pixman.h"
60 #include <time.h>
61+#include <string.h>
62 #include <assert.h>
63 #include <stdio.h>
64 #include <string.h>
65@@ -998,4 +999,46 @@ void pixman_timer_register (pixman_timer_t *timer);
66
67 #endif /* PIXMAN_TIMERS */
68
69+/* a helper function: copies 'height' rows of 'width' bytes each (strides are in bytes), with src/dst overlapping support */
70+static inline void
71+pixman_blt_helper (uint8_t *src_bytes,
72+ uint8_t *dst_bytes,
73+ int src_stride,
74+ int dst_stride,
75+ int width,
76+ int height)
77+{
78+ /*
79+ * The second part of this check is not strictly needed, but it prevents
80+ * unnecessary upside-down processing of areas which belong to different
81+ * images. Upside-down processing can be slower with fixed-distance-ahead
82+ * prefetch and perceived as having more tearing.
83+ */
84+ if (src_bytes < dst_bytes + width &&
85+ src_bytes + src_stride * height > dst_bytes)
86+ {
87+ src_bytes += src_stride * height - src_stride;
88+ dst_bytes += dst_stride * height - dst_stride;
89+ dst_stride = -dst_stride;
90+ src_stride = -src_stride;
91+ /* Horizontal scrolling to the left needs memmove */
92+ if (src_bytes + width > dst_bytes)
93+ {
94+ while (--height >= 0)
95+ {
96+ memmove (dst_bytes, src_bytes, width);
97+ dst_bytes += dst_stride;
98+ src_bytes += src_stride;
99+ }
100+ return;
101+ }
102+ }
103+ while (--height >= 0)
104+ {
105+ memcpy (dst_bytes, src_bytes, width);
106+ dst_bytes += dst_stride;
107+ src_bytes += src_stride;
108+ }
109+}
110+
111 #endif /* PIXMAN_PRIVATE_H */
112--
1131.6.6.1
114
diff --git a/meta-oe/recipes-graphics/xorg-lib/pixman_0.23.6.bb b/meta-oe/recipes-graphics/xorg-lib/pixman_0.23.6.bb
new file mode 100644
index 000000000..bbdffef1d
--- /dev/null
+++ b/meta-oe/recipes-graphics/xorg-lib/pixman_0.23.6.bb
@@ -0,0 +1,29 @@
1require pixman.inc
2
3# Some artefacts observed in webkit scrolling, need to see if it's a regression or not
4DEFAULT_PREFERENCE = "-1"
5
6LICENSE = "MIT & MIT-style & Public Domain"
7LIC_FILES_CHKSUM = "file://COPYING;md5=14096c769ae0cbb5fcb94ec468be11b3\
8 file://pixman/pixman-matrix.c;endline=25;md5=ba6e8769bfaaee2c41698755af04c4be \
9 file://pixman/pixman-arm-neon-asm.h;endline=24;md5=9a9cc1e51abbf1da58f4d9528ec9d49b \
10 "
11
12PR = "${INC_PR}.0"
13
14SRC_URI = "http://xorg.freedesktop.org/archive/individual/lib/${BPN}-${PV}.tar.gz \
15 file://0003-ARM-NEON-Some-cleanup-of-bilinear-scanline-functions.patch \
16 file://0004-ARM-NEON-Bilinear-macro-template-for-instruction-sch.patch \
17 file://0005-ARM-NEON-Replace-old-bilinear-scanline-generator-wit.patch \
18 file://0006-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch \
19 file://0007-ARM-NEON-Instruction-scheduling-of-bilinear-over_888.patch \
20 file://0008-Generic-C-implementation-of-pixman_blt-with-overlapp.patch \
21"
22
23SRC_URI[md5sum] = "27eb7a0ec440c89cccd7c396c3581041"
24SRC_URI[sha256sum] = "4e35f49474e78a9430d93caaaea8bbf7e30b65f0da33c31f15a988c25a3ac369"
25
26NEON = " --disable-arm-neon "
27NEON_armv7a = " "
28
29EXTRA_OECONF = "${NEON} --disable-gtk"