Libav
rgb2rgb_template.c
1 /*
2  * software RGB to RGB converter
2  * plus software PAL8 to RGB converter
4  * software YUV to YUV converter
5  * software YUV to RGB converter
6  * Written by Nick Kurshev.
7  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8  * lots of big-endian byte order fixes by Alex Beregszaszi
9  *
10  * This file is part of Libav.
11  *
12  * Libav is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * Libav is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with Libav; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #include <stddef.h>
28 #include <stdint.h>
29 
30 #include "libavutil/attributes.h"
31 #include "libavutil/x86/asm.h"
32 
33 #undef PREFETCH
34 #undef MOVNTQ
35 #undef EMMS
36 #undef SFENCE
37 #undef PAVGB
38 
39 #if COMPILE_TEMPLATE_AMD3DNOW
40 #define PREFETCH "prefetch"
41 #define PAVGB "pavgusb"
42 #elif COMPILE_TEMPLATE_MMXEXT
43 #define PREFETCH "prefetchnta"
44 #define PAVGB "pavgb"
45 #else
46 #define PREFETCH " # nop"
47 #endif
48 
49 #if COMPILE_TEMPLATE_AMD3DNOW
50 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
51 #define EMMS "femms"
52 #else
53 #define EMMS "emms"
54 #endif
55 
56 #if COMPILE_TEMPLATE_MMXEXT
57 #define MOVNTQ "movntq"
58 #define SFENCE "sfence"
59 #else
60 #define MOVNTQ "movq"
61 #define SFENCE " # nop"
62 #endif
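/* MOVNTQ is a non-temporal (streaming) store that bypasses the cache, which
 * is why every function below issues SFENCE before returning: the fence makes
 * the streamed writes globally visible. Without MMXEXT both degrade to a
 * plain movq and a nop. */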
63 
64 #if !COMPILE_TEMPLATE_SSE2
65 
66 #if !COMPILE_TEMPLATE_AMD3DNOW
67 
68 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
69 {
70  uint8_t *dest = dst;
71  const uint8_t *s = src;
72  const uint8_t *end;
73  const uint8_t *mm_end;
74  end = s + src_size;
75  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
76  mm_end = end - 23;
77  __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
78  while (s < mm_end) {
79  __asm__ volatile(
80  PREFETCH" 32(%1) \n\t"
81  "movd (%1), %%mm0 \n\t"
82  "punpckldq 3(%1), %%mm0 \n\t"
83  "movd 6(%1), %%mm1 \n\t"
84  "punpckldq 9(%1), %%mm1 \n\t"
85  "movd 12(%1), %%mm2 \n\t"
86  "punpckldq 15(%1), %%mm2 \n\t"
87  "movd 18(%1), %%mm3 \n\t"
88  "punpckldq 21(%1), %%mm3 \n\t"
89  "por %%mm7, %%mm0 \n\t"
90  "por %%mm7, %%mm1 \n\t"
91  "por %%mm7, %%mm2 \n\t"
92  "por %%mm7, %%mm3 \n\t"
93  MOVNTQ" %%mm0, (%0) \n\t"
94  MOVNTQ" %%mm1, 8(%0) \n\t"
95  MOVNTQ" %%mm2, 16(%0) \n\t"
96  MOVNTQ" %%mm3, 24(%0)"
97  :: "r"(dest), "r"(s)
98  :"memory");
99  dest += 32;
100  s += 24;
101  }
102  __asm__ volatile(SFENCE:::"memory");
103  __asm__ volatile(EMMS:::"memory");
104  while (s < end) {
105  *dest++ = *s++;
106  *dest++ = *s++;
107  *dest++ = *s++;
108  *dest++ = 255;
109  }
110 }
111 
112 #define STORE_BGR24_MMX \
113  "psrlq $8, %%mm2 \n\t" \
114  "psrlq $8, %%mm3 \n\t" \
115  "psrlq $8, %%mm6 \n\t" \
116  "psrlq $8, %%mm7 \n\t" \
117  "pand "MANGLE(mask24l)", %%mm0\n\t" \
118  "pand "MANGLE(mask24l)", %%mm1\n\t" \
119  "pand "MANGLE(mask24l)", %%mm4\n\t" \
120  "pand "MANGLE(mask24l)", %%mm5\n\t" \
121  "pand "MANGLE(mask24h)", %%mm2\n\t" \
122  "pand "MANGLE(mask24h)", %%mm3\n\t" \
123  "pand "MANGLE(mask24h)", %%mm6\n\t" \
124  "pand "MANGLE(mask24h)", %%mm7\n\t" \
125  "por %%mm2, %%mm0 \n\t" \
126  "por %%mm3, %%mm1 \n\t" \
127  "por %%mm6, %%mm4 \n\t" \
128  "por %%mm7, %%mm5 \n\t" \
129  \
130  "movq %%mm1, %%mm2 \n\t" \
131  "movq %%mm4, %%mm3 \n\t" \
132  "psllq $48, %%mm2 \n\t" \
133  "psllq $32, %%mm3 \n\t" \
134  "pand "MANGLE(mask24hh)", %%mm2\n\t" \
135  "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
136  "por %%mm2, %%mm0 \n\t" \
137  "psrlq $16, %%mm1 \n\t" \
138  "psrlq $32, %%mm4 \n\t" \
139  "psllq $16, %%mm5 \n\t" \
140  "por %%mm3, %%mm1 \n\t" \
141  "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
142  "por %%mm5, %%mm4 \n\t" \
143  \
144  MOVNTQ" %%mm0, (%0) \n\t" \
145  MOVNTQ" %%mm1, 8(%0) \n\t" \
146  MOVNTQ" %%mm4, 16(%0)"
147 
148 
149 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
150 {
151  uint8_t *dest = dst;
152  const uint8_t *s = src;
153  const uint8_t *end;
154  const uint8_t *mm_end;
155  end = s + src_size;
156  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
157  mm_end = end - 31;
158  while (s < mm_end) {
159  __asm__ volatile(
160  PREFETCH" 32(%1) \n\t"
161  "movq (%1), %%mm0 \n\t"
162  "movq 8(%1), %%mm1 \n\t"
163  "movq 16(%1), %%mm4 \n\t"
164  "movq 24(%1), %%mm5 \n\t"
165  "movq %%mm0, %%mm2 \n\t"
166  "movq %%mm1, %%mm3 \n\t"
167  "movq %%mm4, %%mm6 \n\t"
168  "movq %%mm5, %%mm7 \n\t"
169  STORE_BGR24_MMX
170  :: "r"(dest), "r"(s)
171  :"memory");
172  dest += 24;
173  s += 32;
174  }
175  __asm__ volatile(SFENCE:::"memory");
176  __asm__ volatile(EMMS:::"memory");
177  while (s < end) {
178  *dest++ = *s++;
179  *dest++ = *s++;
180  *dest++ = *s++;
181  s++;
182  }
183 }
184 
185 /*
186  original by Strepto/Astral
187  ported to gcc & bugfixed: A'rpi
188  MMXEXT, 3DNOW optimization by Nick Kurshev
189  32-bit C version and the and&add trick by Michael Niedermayer
190 */
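/* The and&add trick used below (paddw in the MMX loop, plain add in the C
 * tail): for a 15-bit pixel x = 0rrrrrgggggbbbbb, adding the masked
 * red/green field back to x shifts that field left by one in a single add:
 *     (x & 0x7FFF) + (x & 0x7FE0)  ==  ((x & 0x7FE0) << 1) | (x & 0x001F)
 * The two terms occupy disjoint bits, so no carry crosses into blue; the
 * result is RGB565 with a zero green LSB, e.g. 0x7FFF (15-bit white)
 * becomes 0xFFDF. */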
191 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
192 {
193  register const uint8_t* s=src;
194  register uint8_t* d=dst;
195  register const uint8_t *end;
196  const uint8_t *mm_end;
197  end = s + src_size;
198  __asm__ volatile(PREFETCH" %0"::"m"(*s));
199  __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
200  mm_end = end - 15;
201  while (s<mm_end) {
202  __asm__ volatile(
203  PREFETCH" 32(%1) \n\t"
204  "movq (%1), %%mm0 \n\t"
205  "movq 8(%1), %%mm2 \n\t"
206  "movq %%mm0, %%mm1 \n\t"
207  "movq %%mm2, %%mm3 \n\t"
208  "pand %%mm4, %%mm0 \n\t"
209  "pand %%mm4, %%mm2 \n\t"
210  "paddw %%mm1, %%mm0 \n\t"
211  "paddw %%mm3, %%mm2 \n\t"
212  MOVNTQ" %%mm0, (%0) \n\t"
213  MOVNTQ" %%mm2, 8(%0)"
214  :: "r"(d), "r"(s)
215  );
216  d+=16;
217  s+=16;
218  }
219  __asm__ volatile(SFENCE:::"memory");
220  __asm__ volatile(EMMS:::"memory");
221  mm_end = end - 3;
222  while (s < mm_end) {
223  register unsigned x= *((const uint32_t *)s);
224  *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
225  d+=4;
226  s+=4;
227  }
228  if (s < end) {
229  register unsigned short x= *((const uint16_t *)s);
230  *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
231  }
232 }
233 
234 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
235 {
236  register const uint8_t* s=src;
237  register uint8_t* d=dst;
238  register const uint8_t *end;
239  const uint8_t *mm_end;
240  end = s + src_size;
241  __asm__ volatile(PREFETCH" %0"::"m"(*s));
242  __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
243  __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
244  mm_end = end - 15;
245  while (s<mm_end) {
246  __asm__ volatile(
247  PREFETCH" 32(%1) \n\t"
248  "movq (%1), %%mm0 \n\t"
249  "movq 8(%1), %%mm2 \n\t"
250  "movq %%mm0, %%mm1 \n\t"
251  "movq %%mm2, %%mm3 \n\t"
252  "psrlq $1, %%mm0 \n\t"
253  "psrlq $1, %%mm2 \n\t"
254  "pand %%mm7, %%mm0 \n\t"
255  "pand %%mm7, %%mm2 \n\t"
256  "pand %%mm6, %%mm1 \n\t"
257  "pand %%mm6, %%mm3 \n\t"
258  "por %%mm1, %%mm0 \n\t"
259  "por %%mm3, %%mm2 \n\t"
260  MOVNTQ" %%mm0, (%0) \n\t"
261  MOVNTQ" %%mm2, 8(%0)"
262  :: "r"(d), "r"(s)
263  );
264  d+=16;
265  s+=16;
266  }
267  __asm__ volatile(SFENCE:::"memory");
268  __asm__ volatile(EMMS:::"memory");
269  mm_end = end - 3;
270  while (s < mm_end) {
271  register uint32_t x= *((const uint32_t*)s);
272  *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
273  s+=4;
274  d+=4;
275  }
276  if (s < end) {
277  register uint16_t x= *((const uint16_t*)s);
278  *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
279  }
280 }
281 
282 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
283 {
284  const uint8_t *s = src;
285  const uint8_t *end;
286  const uint8_t *mm_end;
287  uint16_t *d = (uint16_t *)dst;
288  end = s + src_size;
289  mm_end = end - 15;
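 /* With blue and red isolated by mask3216br, one pmaddwd against mul3216
  * scales both fields at once (blue by 4 and red by 8192, i.e. <<2 and <<13
  * plus a horizontal add); green is masked separately, merged with por, and
  * the final psrld $5 drops all three fields into RGB565 position. */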
290  __asm__ volatile(
291  "movq %3, %%mm5 \n\t"
292  "movq %4, %%mm6 \n\t"
293  "movq %5, %%mm7 \n\t"
294  "jmp 2f \n\t"
295  ".p2align 4 \n\t"
296  "1: \n\t"
297  PREFETCH" 32(%1) \n\t"
298  "movd (%1), %%mm0 \n\t"
299  "movd 4(%1), %%mm3 \n\t"
300  "punpckldq 8(%1), %%mm0 \n\t"
301  "punpckldq 12(%1), %%mm3 \n\t"
302  "movq %%mm0, %%mm1 \n\t"
303  "movq %%mm3, %%mm4 \n\t"
304  "pand %%mm6, %%mm0 \n\t"
305  "pand %%mm6, %%mm3 \n\t"
306  "pmaddwd %%mm7, %%mm0 \n\t"
307  "pmaddwd %%mm7, %%mm3 \n\t"
308  "pand %%mm5, %%mm1 \n\t"
309  "pand %%mm5, %%mm4 \n\t"
310  "por %%mm1, %%mm0 \n\t"
311  "por %%mm4, %%mm3 \n\t"
312  "psrld $5, %%mm0 \n\t"
313  "pslld $11, %%mm3 \n\t"
314  "por %%mm3, %%mm0 \n\t"
315  MOVNTQ" %%mm0, (%0) \n\t"
316  "add $16, %1 \n\t"
317  "add $8, %0 \n\t"
318  "2: \n\t"
319  "cmp %2, %1 \n\t"
320  " jb 1b \n\t"
321  : "+r" (d), "+r"(s)
322  : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
323  );
324  __asm__ volatile(SFENCE:::"memory");
325  __asm__ volatile(EMMS:::"memory");
326  while (s < end) {
327  register int rgb = *(const uint32_t*)s; s += 4;
328  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
329  }
330 }
331 
332 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
333 {
334  const uint8_t *s = src;
335  const uint8_t *end;
336  const uint8_t *mm_end;
337  uint16_t *d = (uint16_t *)dst;
338  end = s + src_size;
339  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
340  __asm__ volatile(
341  "movq %0, %%mm7 \n\t"
342  "movq %1, %%mm6 \n\t"
343  ::"m"(red_16mask),"m"(green_16mask));
344  mm_end = end - 15;
345  while (s < mm_end) {
346  __asm__ volatile(
347  PREFETCH" 32(%1) \n\t"
348  "movd (%1), %%mm0 \n\t"
349  "movd 4(%1), %%mm3 \n\t"
350  "punpckldq 8(%1), %%mm0 \n\t"
351  "punpckldq 12(%1), %%mm3 \n\t"
352  "movq %%mm0, %%mm1 \n\t"
353  "movq %%mm0, %%mm2 \n\t"
354  "movq %%mm3, %%mm4 \n\t"
355  "movq %%mm3, %%mm5 \n\t"
356  "psllq $8, %%mm0 \n\t"
357  "psllq $8, %%mm3 \n\t"
358  "pand %%mm7, %%mm0 \n\t"
359  "pand %%mm7, %%mm3 \n\t"
360  "psrlq $5, %%mm1 \n\t"
361  "psrlq $5, %%mm4 \n\t"
362  "pand %%mm6, %%mm1 \n\t"
363  "pand %%mm6, %%mm4 \n\t"
364  "psrlq $19, %%mm2 \n\t"
365  "psrlq $19, %%mm5 \n\t"
366  "pand %2, %%mm2 \n\t"
367  "pand %2, %%mm5 \n\t"
368  "por %%mm1, %%mm0 \n\t"
369  "por %%mm4, %%mm3 \n\t"
370  "por %%mm2, %%mm0 \n\t"
371  "por %%mm5, %%mm3 \n\t"
372  "psllq $16, %%mm3 \n\t"
373  "por %%mm3, %%mm0 \n\t"
374  MOVNTQ" %%mm0, (%0) \n\t"
375  :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
376  d += 4;
377  s += 16;
378  }
379  __asm__ volatile(SFENCE:::"memory");
380  __asm__ volatile(EMMS:::"memory");
381  while (s < end) {
382  register int rgb = *(const uint32_t*)s; s += 4;
383  *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
384  }
385 }
386 
387 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
388 {
389  const uint8_t *s = src;
390  const uint8_t *end;
391  const uint8_t *mm_end;
392  uint16_t *d = (uint16_t *)dst;
393  end = s + src_size;
394  mm_end = end - 15;
395  __asm__ volatile(
396  "movq %3, %%mm5 \n\t"
397  "movq %4, %%mm6 \n\t"
398  "movq %5, %%mm7 \n\t"
399  "jmp 2f \n\t"
400  ".p2align 4 \n\t"
401  "1: \n\t"
402  PREFETCH" 32(%1) \n\t"
403  "movd (%1), %%mm0 \n\t"
404  "movd 4(%1), %%mm3 \n\t"
405  "punpckldq 8(%1), %%mm0 \n\t"
406  "punpckldq 12(%1), %%mm3 \n\t"
407  "movq %%mm0, %%mm1 \n\t"
408  "movq %%mm3, %%mm4 \n\t"
409  "pand %%mm6, %%mm0 \n\t"
410  "pand %%mm6, %%mm3 \n\t"
411  "pmaddwd %%mm7, %%mm0 \n\t"
412  "pmaddwd %%mm7, %%mm3 \n\t"
413  "pand %%mm5, %%mm1 \n\t"
414  "pand %%mm5, %%mm4 \n\t"
415  "por %%mm1, %%mm0 \n\t"
416  "por %%mm4, %%mm3 \n\t"
417  "psrld $6, %%mm0 \n\t"
418  "pslld $10, %%mm3 \n\t"
419  "por %%mm3, %%mm0 \n\t"
420  MOVNTQ" %%mm0, (%0) \n\t"
421  "add $16, %1 \n\t"
422  "add $8, %0 \n\t"
423  "2: \n\t"
424  "cmp %2, %1 \n\t"
425  " jb 1b \n\t"
426  : "+r" (d), "+r"(s)
427  : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
428  );
429  __asm__ volatile(SFENCE:::"memory");
430  __asm__ volatile(EMMS:::"memory");
431  while (s < end) {
432  register int rgb = *(const uint32_t*)s; s += 4;
433  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
434  }
435 }
436 
437 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
438 {
439  const uint8_t *s = src;
440  const uint8_t *end;
441  const uint8_t *mm_end;
442  uint16_t *d = (uint16_t *)dst;
443  end = s + src_size;
444  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
445  __asm__ volatile(
446  "movq %0, %%mm7 \n\t"
447  "movq %1, %%mm6 \n\t"
448  ::"m"(red_15mask),"m"(green_15mask));
449  mm_end = end - 15;
450  while (s < mm_end) {
451  __asm__ volatile(
452  PREFETCH" 32(%1) \n\t"
453  "movd (%1), %%mm0 \n\t"
454  "movd 4(%1), %%mm3 \n\t"
455  "punpckldq 8(%1), %%mm0 \n\t"
456  "punpckldq 12(%1), %%mm3 \n\t"
457  "movq %%mm0, %%mm1 \n\t"
458  "movq %%mm0, %%mm2 \n\t"
459  "movq %%mm3, %%mm4 \n\t"
460  "movq %%mm3, %%mm5 \n\t"
461  "psllq $7, %%mm0 \n\t"
462  "psllq $7, %%mm3 \n\t"
463  "pand %%mm7, %%mm0 \n\t"
464  "pand %%mm7, %%mm3 \n\t"
465  "psrlq $6, %%mm1 \n\t"
466  "psrlq $6, %%mm4 \n\t"
467  "pand %%mm6, %%mm1 \n\t"
468  "pand %%mm6, %%mm4 \n\t"
469  "psrlq $19, %%mm2 \n\t"
470  "psrlq $19, %%mm5 \n\t"
471  "pand %2, %%mm2 \n\t"
472  "pand %2, %%mm5 \n\t"
473  "por %%mm1, %%mm0 \n\t"
474  "por %%mm4, %%mm3 \n\t"
475  "por %%mm2, %%mm0 \n\t"
476  "por %%mm5, %%mm3 \n\t"
477  "psllq $16, %%mm3 \n\t"
478  "por %%mm3, %%mm0 \n\t"
479  MOVNTQ" %%mm0, (%0) \n\t"
480  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
481  d += 4;
482  s += 16;
483  }
484  __asm__ volatile(SFENCE:::"memory");
485  __asm__ volatile(EMMS:::"memory");
486  while (s < end) {
487  register int rgb = *(const uint32_t*)s; s += 4;
488  *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
489  }
490 }
491 
492 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
493 {
494  const uint8_t *s = src;
495  const uint8_t *end;
496  const uint8_t *mm_end;
497  uint16_t *d = (uint16_t *)dst;
498  end = s + src_size;
499  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
500  __asm__ volatile(
501  "movq %0, %%mm7 \n\t"
502  "movq %1, %%mm6 \n\t"
503  ::"m"(red_16mask),"m"(green_16mask));
504  mm_end = end - 11;
505  while (s < mm_end) {
506  __asm__ volatile(
507  PREFETCH" 32(%1) \n\t"
508  "movd (%1), %%mm0 \n\t"
509  "movd 3(%1), %%mm3 \n\t"
510  "punpckldq 6(%1), %%mm0 \n\t"
511  "punpckldq 9(%1), %%mm3 \n\t"
512  "movq %%mm0, %%mm1 \n\t"
513  "movq %%mm0, %%mm2 \n\t"
514  "movq %%mm3, %%mm4 \n\t"
515  "movq %%mm3, %%mm5 \n\t"
516  "psrlq $3, %%mm0 \n\t"
517  "psrlq $3, %%mm3 \n\t"
518  "pand %2, %%mm0 \n\t"
519  "pand %2, %%mm3 \n\t"
520  "psrlq $5, %%mm1 \n\t"
521  "psrlq $5, %%mm4 \n\t"
522  "pand %%mm6, %%mm1 \n\t"
523  "pand %%mm6, %%mm4 \n\t"
524  "psrlq $8, %%mm2 \n\t"
525  "psrlq $8, %%mm5 \n\t"
526  "pand %%mm7, %%mm2 \n\t"
527  "pand %%mm7, %%mm5 \n\t"
528  "por %%mm1, %%mm0 \n\t"
529  "por %%mm4, %%mm3 \n\t"
530  "por %%mm2, %%mm0 \n\t"
531  "por %%mm5, %%mm3 \n\t"
532  "psllq $16, %%mm3 \n\t"
533  "por %%mm3, %%mm0 \n\t"
534  MOVNTQ" %%mm0, (%0) \n\t"
535  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
536  d += 4;
537  s += 12;
538  }
539  __asm__ volatile(SFENCE:::"memory");
540  __asm__ volatile(EMMS:::"memory");
541  while (s < end) {
542  const int b = *s++;
543  const int g = *s++;
544  const int r = *s++;
545  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
546  }
547 }
548 
549 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
550 {
551  const uint8_t *s = src;
552  const uint8_t *end;
553  const uint8_t *mm_end;
554  uint16_t *d = (uint16_t *)dst;
555  end = s + src_size;
556  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
557  __asm__ volatile(
558  "movq %0, %%mm7 \n\t"
559  "movq %1, %%mm6 \n\t"
560  ::"m"(red_16mask),"m"(green_16mask));
561  mm_end = end - 15;
562  while (s < mm_end) {
563  __asm__ volatile(
564  PREFETCH" 32(%1) \n\t"
565  "movd (%1), %%mm0 \n\t"
566  "movd 3(%1), %%mm3 \n\t"
567  "punpckldq 6(%1), %%mm0 \n\t"
568  "punpckldq 9(%1), %%mm3 \n\t"
569  "movq %%mm0, %%mm1 \n\t"
570  "movq %%mm0, %%mm2 \n\t"
571  "movq %%mm3, %%mm4 \n\t"
572  "movq %%mm3, %%mm5 \n\t"
573  "psllq $8, %%mm0 \n\t"
574  "psllq $8, %%mm3 \n\t"
575  "pand %%mm7, %%mm0 \n\t"
576  "pand %%mm7, %%mm3 \n\t"
577  "psrlq $5, %%mm1 \n\t"
578  "psrlq $5, %%mm4 \n\t"
579  "pand %%mm6, %%mm1 \n\t"
580  "pand %%mm6, %%mm4 \n\t"
581  "psrlq $19, %%mm2 \n\t"
582  "psrlq $19, %%mm5 \n\t"
583  "pand %2, %%mm2 \n\t"
584  "pand %2, %%mm5 \n\t"
585  "por %%mm1, %%mm0 \n\t"
586  "por %%mm4, %%mm3 \n\t"
587  "por %%mm2, %%mm0 \n\t"
588  "por %%mm5, %%mm3 \n\t"
589  "psllq $16, %%mm3 \n\t"
590  "por %%mm3, %%mm0 \n\t"
591  MOVNTQ" %%mm0, (%0) \n\t"
592  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
593  d += 4;
594  s += 12;
595  }
596  __asm__ volatile(SFENCE:::"memory");
597  __asm__ volatile(EMMS:::"memory");
598  while (s < end) {
599  const int r = *s++;
600  const int g = *s++;
601  const int b = *s++;
602  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
603  }
604 }
605 
606 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
607 {
608  const uint8_t *s = src;
609  const uint8_t *end;
610  const uint8_t *mm_end;
611  uint16_t *d = (uint16_t *)dst;
612  end = s + src_size;
613  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
614  __asm__ volatile(
615  "movq %0, %%mm7 \n\t"
616  "movq %1, %%mm6 \n\t"
617  ::"m"(red_15mask),"m"(green_15mask));
618  mm_end = end - 11;
619  while (s < mm_end) {
620  __asm__ volatile(
621  PREFETCH" 32(%1) \n\t"
622  "movd (%1), %%mm0 \n\t"
623  "movd 3(%1), %%mm3 \n\t"
624  "punpckldq 6(%1), %%mm0 \n\t"
625  "punpckldq 9(%1), %%mm3 \n\t"
626  "movq %%mm0, %%mm1 \n\t"
627  "movq %%mm0, %%mm2 \n\t"
628  "movq %%mm3, %%mm4 \n\t"
629  "movq %%mm3, %%mm5 \n\t"
630  "psrlq $3, %%mm0 \n\t"
631  "psrlq $3, %%mm3 \n\t"
632  "pand %2, %%mm0 \n\t"
633  "pand %2, %%mm3 \n\t"
634  "psrlq $6, %%mm1 \n\t"
635  "psrlq $6, %%mm4 \n\t"
636  "pand %%mm6, %%mm1 \n\t"
637  "pand %%mm6, %%mm4 \n\t"
638  "psrlq $9, %%mm2 \n\t"
639  "psrlq $9, %%mm5 \n\t"
640  "pand %%mm7, %%mm2 \n\t"
641  "pand %%mm7, %%mm5 \n\t"
642  "por %%mm1, %%mm0 \n\t"
643  "por %%mm4, %%mm3 \n\t"
644  "por %%mm2, %%mm0 \n\t"
645  "por %%mm5, %%mm3 \n\t"
646  "psllq $16, %%mm3 \n\t"
647  "por %%mm3, %%mm0 \n\t"
648  MOVNTQ" %%mm0, (%0) \n\t"
649  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
650  d += 4;
651  s += 12;
652  }
653  __asm__ volatile(SFENCE:::"memory");
654  __asm__ volatile(EMMS:::"memory");
655  while (s < end) {
656  const int b = *s++;
657  const int g = *s++;
658  const int r = *s++;
659  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
660  }
661 }
662 
663 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
664 {
665  const uint8_t *s = src;
666  const uint8_t *end;
667  const uint8_t *mm_end;
668  uint16_t *d = (uint16_t *)dst;
669  end = s + src_size;
670  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
671  __asm__ volatile(
672  "movq %0, %%mm7 \n\t"
673  "movq %1, %%mm6 \n\t"
674  ::"m"(red_15mask),"m"(green_15mask));
675  mm_end = end - 15;
676  while (s < mm_end) {
677  __asm__ volatile(
678  PREFETCH" 32(%1) \n\t"
679  "movd (%1), %%mm0 \n\t"
680  "movd 3(%1), %%mm3 \n\t"
681  "punpckldq 6(%1), %%mm0 \n\t"
682  "punpckldq 9(%1), %%mm3 \n\t"
683  "movq %%mm0, %%mm1 \n\t"
684  "movq %%mm0, %%mm2 \n\t"
685  "movq %%mm3, %%mm4 \n\t"
686  "movq %%mm3, %%mm5 \n\t"
687  "psllq $7, %%mm0 \n\t"
688  "psllq $7, %%mm3 \n\t"
689  "pand %%mm7, %%mm0 \n\t"
690  "pand %%mm7, %%mm3 \n\t"
691  "psrlq $6, %%mm1 \n\t"
692  "psrlq $6, %%mm4 \n\t"
693  "pand %%mm6, %%mm1 \n\t"
694  "pand %%mm6, %%mm4 \n\t"
695  "psrlq $19, %%mm2 \n\t"
696  "psrlq $19, %%mm5 \n\t"
697  "pand %2, %%mm2 \n\t"
698  "pand %2, %%mm5 \n\t"
699  "por %%mm1, %%mm0 \n\t"
700  "por %%mm4, %%mm3 \n\t"
701  "por %%mm2, %%mm0 \n\t"
702  "por %%mm5, %%mm3 \n\t"
703  "psllq $16, %%mm3 \n\t"
704  "por %%mm3, %%mm0 \n\t"
705  MOVNTQ" %%mm0, (%0) \n\t"
706  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
707  d += 4;
708  s += 12;
709  }
710  __asm__ volatile(SFENCE:::"memory");
711  __asm__ volatile(EMMS:::"memory");
712  while (s < end) {
713  const int r = *s++;
714  const int g = *s++;
715  const int b = *s++;
716  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
717  }
718 }
719 
720 /*
721  I use a less accurate approximation here by simply left-shifting the input
722  value and filling the low order bits with zeroes. This method improves PNG
723  compression but this scheme cannot reproduce white exactly, since it does
724  not generate an all-ones maximum value; the net effect is to darken the
725  image slightly.
726 
727  A better method would be "left bit replication":
728 
729  4 3 2 1 0
730  ---------
731  1 1 0 1 1
732 
733  7 6 5 4 3 2 1 0
734  ----------------
735  1 1 0 1 1 1 1 0
736  |=======| |===|
737  | leftmost bits repeated to fill open bits
738  |
739  original bits
740 */
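/* For reference, full left bit replication from 5 to 8 bits is simply
 *     out = (in << 3) | (in >> 2);
 * which maps 0x1F to 0xFF (true white), whereas plain shifting gives 0xF8. */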
741 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
742 {
743  const uint16_t *end;
744  const uint16_t *mm_end;
745  uint8_t *d = dst;
746  const uint16_t *s = (const uint16_t*)src;
747  end = s + src_size/2;
748  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
749  mm_end = end - 7;
750  while (s < mm_end) {
751  __asm__ volatile(
752  PREFETCH" 32(%1) \n\t"
753  "movq (%1), %%mm0 \n\t"
754  "movq (%1), %%mm1 \n\t"
755  "movq (%1), %%mm2 \n\t"
756  "pand %2, %%mm0 \n\t"
757  "pand %3, %%mm1 \n\t"
758  "pand %4, %%mm2 \n\t"
759  "psllq $3, %%mm0 \n\t"
760  "psrlq $2, %%mm1 \n\t"
761  "psrlq $7, %%mm2 \n\t"
762  "movq %%mm0, %%mm3 \n\t"
763  "movq %%mm1, %%mm4 \n\t"
764  "movq %%mm2, %%mm5 \n\t"
765  "punpcklwd %5, %%mm0 \n\t"
766  "punpcklwd %5, %%mm1 \n\t"
767  "punpcklwd %5, %%mm2 \n\t"
768  "punpckhwd %5, %%mm3 \n\t"
769  "punpckhwd %5, %%mm4 \n\t"
770  "punpckhwd %5, %%mm5 \n\t"
771  "psllq $8, %%mm1 \n\t"
772  "psllq $16, %%mm2 \n\t"
773  "por %%mm1, %%mm0 \n\t"
774  "por %%mm2, %%mm0 \n\t"
775  "psllq $8, %%mm4 \n\t"
776  "psllq $16, %%mm5 \n\t"
777  "por %%mm4, %%mm3 \n\t"
778  "por %%mm5, %%mm3 \n\t"
779 
780  "movq %%mm0, %%mm6 \n\t"
781  "movq %%mm3, %%mm7 \n\t"
782 
783  "movq 8(%1), %%mm0 \n\t"
784  "movq 8(%1), %%mm1 \n\t"
785  "movq 8(%1), %%mm2 \n\t"
786  "pand %2, %%mm0 \n\t"
787  "pand %3, %%mm1 \n\t"
788  "pand %4, %%mm2 \n\t"
789  "psllq $3, %%mm0 \n\t"
790  "psrlq $2, %%mm1 \n\t"
791  "psrlq $7, %%mm2 \n\t"
792  "movq %%mm0, %%mm3 \n\t"
793  "movq %%mm1, %%mm4 \n\t"
794  "movq %%mm2, %%mm5 \n\t"
795  "punpcklwd %5, %%mm0 \n\t"
796  "punpcklwd %5, %%mm1 \n\t"
797  "punpcklwd %5, %%mm2 \n\t"
798  "punpckhwd %5, %%mm3 \n\t"
799  "punpckhwd %5, %%mm4 \n\t"
800  "punpckhwd %5, %%mm5 \n\t"
801  "psllq $8, %%mm1 \n\t"
802  "psllq $16, %%mm2 \n\t"
803  "por %%mm1, %%mm0 \n\t"
804  "por %%mm2, %%mm0 \n\t"
805  "psllq $8, %%mm4 \n\t"
806  "psllq $16, %%mm5 \n\t"
807  "por %%mm4, %%mm3 \n\t"
808  "por %%mm5, %%mm3 \n\t"
809 
810  :"=m"(*d)
811  :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
812  :"memory");
813  /* borrowed 32 to 24 */
814  __asm__ volatile(
815  "movq %%mm0, %%mm4 \n\t"
816  "movq %%mm3, %%mm5 \n\t"
817  "movq %%mm6, %%mm0 \n\t"
818  "movq %%mm7, %%mm1 \n\t"
819 
820  "movq %%mm4, %%mm6 \n\t"
821  "movq %%mm5, %%mm7 \n\t"
822  "movq %%mm0, %%mm2 \n\t"
823  "movq %%mm1, %%mm3 \n\t"
824 
825  STORE_BGR24_MMX
826 
827  :: "r"(d), "m"(*s)
828  :"memory");
829  d += 24;
830  s += 8;
831  }
832  __asm__ volatile(SFENCE:::"memory");
833  __asm__ volatile(EMMS:::"memory");
834  while (s < end) {
835  register uint16_t bgr;
836  bgr = *s++;
837  *d++ = (bgr&0x1F)<<3;
838  *d++ = (bgr&0x3E0)>>2;
839  *d++ = (bgr&0x7C00)>>7;
840  }
841 }
842 
843 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
844 {
845  const uint16_t *end;
846  const uint16_t *mm_end;
847  uint8_t *d = (uint8_t *)dst;
848  const uint16_t *s = (const uint16_t *)src;
849  end = s + src_size/2;
850  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
851  mm_end = end - 7;
852  while (s < mm_end) {
853  __asm__ volatile(
854  PREFETCH" 32(%1) \n\t"
855  "movq (%1), %%mm0 \n\t"
856  "movq (%1), %%mm1 \n\t"
857  "movq (%1), %%mm2 \n\t"
858  "pand %2, %%mm0 \n\t"
859  "pand %3, %%mm1 \n\t"
860  "pand %4, %%mm2 \n\t"
861  "psllq $3, %%mm0 \n\t"
862  "psrlq $3, %%mm1 \n\t"
863  "psrlq $8, %%mm2 \n\t"
864  "movq %%mm0, %%mm3 \n\t"
865  "movq %%mm1, %%mm4 \n\t"
866  "movq %%mm2, %%mm5 \n\t"
867  "punpcklwd %5, %%mm0 \n\t"
868  "punpcklwd %5, %%mm1 \n\t"
869  "punpcklwd %5, %%mm2 \n\t"
870  "punpckhwd %5, %%mm3 \n\t"
871  "punpckhwd %5, %%mm4 \n\t"
872  "punpckhwd %5, %%mm5 \n\t"
873  "psllq $8, %%mm1 \n\t"
874  "psllq $16, %%mm2 \n\t"
875  "por %%mm1, %%mm0 \n\t"
876  "por %%mm2, %%mm0 \n\t"
877  "psllq $8, %%mm4 \n\t"
878  "psllq $16, %%mm5 \n\t"
879  "por %%mm4, %%mm3 \n\t"
880  "por %%mm5, %%mm3 \n\t"
881 
882  "movq %%mm0, %%mm6 \n\t"
883  "movq %%mm3, %%mm7 \n\t"
884 
885  "movq 8(%1), %%mm0 \n\t"
886  "movq 8(%1), %%mm1 \n\t"
887  "movq 8(%1), %%mm2 \n\t"
888  "pand %2, %%mm0 \n\t"
889  "pand %3, %%mm1 \n\t"
890  "pand %4, %%mm2 \n\t"
891  "psllq $3, %%mm0 \n\t"
892  "psrlq $3, %%mm1 \n\t"
893  "psrlq $8, %%mm2 \n\t"
894  "movq %%mm0, %%mm3 \n\t"
895  "movq %%mm1, %%mm4 \n\t"
896  "movq %%mm2, %%mm5 \n\t"
897  "punpcklwd %5, %%mm0 \n\t"
898  "punpcklwd %5, %%mm1 \n\t"
899  "punpcklwd %5, %%mm2 \n\t"
900  "punpckhwd %5, %%mm3 \n\t"
901  "punpckhwd %5, %%mm4 \n\t"
902  "punpckhwd %5, %%mm5 \n\t"
903  "psllq $8, %%mm1 \n\t"
904  "psllq $16, %%mm2 \n\t"
905  "por %%mm1, %%mm0 \n\t"
906  "por %%mm2, %%mm0 \n\t"
907  "psllq $8, %%mm4 \n\t"
908  "psllq $16, %%mm5 \n\t"
909  "por %%mm4, %%mm3 \n\t"
910  "por %%mm5, %%mm3 \n\t"
911  :"=m"(*d)
912  :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
913  :"memory");
914  /* borrowed 32 to 24 */
915  __asm__ volatile(
916  "movq %%mm0, %%mm4 \n\t"
917  "movq %%mm3, %%mm5 \n\t"
918  "movq %%mm6, %%mm0 \n\t"
919  "movq %%mm7, %%mm1 \n\t"
920 
921  "movq %%mm4, %%mm6 \n\t"
922  "movq %%mm5, %%mm7 \n\t"
923  "movq %%mm0, %%mm2 \n\t"
924  "movq %%mm1, %%mm3 \n\t"
925 
926  STORE_BGR24_MMX
927 
928  :: "r"(d), "m"(*s)
929  :"memory");
930  d += 24;
931  s += 8;
932  }
933  __asm__ volatile(SFENCE:::"memory");
934  __asm__ volatile(EMMS:::"memory");
935  while (s < end) {
936  register uint16_t bgr;
937  bgr = *s++;
938  *d++ = (bgr&0x1F)<<3;
939  *d++ = (bgr&0x7E0)>>3;
940  *d++ = (bgr&0xF800)>>8;
941  }
942 }
943 
944 /*
945  * mm0 = 00 B3 00 B2 00 B1 00 B0
946  * mm1 = 00 G3 00 G2 00 G1 00 G0
947  * mm2 = 00 R3 00 R2 00 R1 00 R0
948  * mm6 = FF FF FF FF FF FF FF FF
949  * mm7 = 00 00 00 00 00 00 00 00
950  */
951 #define PACK_RGB32 \
952  "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
953  "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
954  "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
955  "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
956  "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
957  "movq %%mm0, %%mm3 \n\t" \
958  "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
959  "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
960  MOVNTQ" %%mm0, (%0) \n\t" \
961  MOVNTQ" %%mm3, 8(%0) \n\t" \
962 
963 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
964 {
965  const uint16_t *end;
966  const uint16_t *mm_end;
967  uint8_t *d = dst;
968  const uint16_t *s = (const uint16_t *)src;
969  end = s + src_size/2;
970  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
971  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
972  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
973  mm_end = end - 3;
974  while (s < mm_end) {
975  __asm__ volatile(
976  PREFETCH" 32(%1) \n\t"
977  "movq (%1), %%mm0 \n\t"
978  "movq (%1), %%mm1 \n\t"
979  "movq (%1), %%mm2 \n\t"
980  "pand %2, %%mm0 \n\t"
981  "pand %3, %%mm1 \n\t"
982  "pand %4, %%mm2 \n\t"
983  "psllq $3, %%mm0 \n\t"
984  "psrlq $2, %%mm1 \n\t"
985  "psrlq $7, %%mm2 \n\t"
986  PACK_RGB32
987  ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
988  :"memory");
989  d += 16;
990  s += 4;
991  }
992  __asm__ volatile(SFENCE:::"memory");
993  __asm__ volatile(EMMS:::"memory");
994  while (s < end) {
995  register uint16_t bgr;
996  bgr = *s++;
997  *d++ = (bgr&0x1F)<<3;
998  *d++ = (bgr&0x3E0)>>2;
999  *d++ = (bgr&0x7C00)>>7;
1000  *d++ = 255;
1001  }
1002 }
1003 
1004 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
1005 {
1006  const uint16_t *end;
1007  const uint16_t *mm_end;
1008  uint8_t *d = dst;
1009  const uint16_t *s = (const uint16_t*)src;
1010  end = s + src_size/2;
1011  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1012  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1013  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1014  mm_end = end - 3;
1015  while (s < mm_end) {
1016  __asm__ volatile(
1017  PREFETCH" 32(%1) \n\t"
1018  "movq (%1), %%mm0 \n\t"
1019  "movq (%1), %%mm1 \n\t"
1020  "movq (%1), %%mm2 \n\t"
1021  "pand %2, %%mm0 \n\t"
1022  "pand %3, %%mm1 \n\t"
1023  "pand %4, %%mm2 \n\t"
1024  "psllq $3, %%mm0 \n\t"
1025  "psrlq $3, %%mm1 \n\t"
1026  "psrlq $8, %%mm2 \n\t"
1027  PACK_RGB32
1028  ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1029  :"memory");
1030  d += 16;
1031  s += 4;
1032  }
1033  __asm__ volatile(SFENCE:::"memory");
1034  __asm__ volatile(EMMS:::"memory");
1035  while (s < end) {
1036  register uint16_t bgr;
1037  bgr = *s++;
1038  *d++ = (bgr&0x1F)<<3;
1039  *d++ = (bgr&0x7E0)>>3;
1040  *d++ = (bgr&0xF800)>>8;
1041  *d++ = 255;
1042  }
1043 }
1044 
1045 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
1046 {
1047  x86_reg idx = 15 - src_size;
1048  const uint8_t *s = src-idx;
1049  uint8_t *d = dst-idx;
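 /* idx starts negative (15 - src_size) and doubles as loop counter and
  * addressing offset: "add $16; js 1b" loops while idx < 0, the leading
  * "test/jns" skips the SIMD loop entirely for buffers under 16 bytes, and
  * the final 0..15 bytes fall through to the C loop below. */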
1050  __asm__ volatile(
1051  "test %0, %0 \n\t"
1052  "jns 2f \n\t"
1053  PREFETCH" (%1, %0) \n\t"
1054  "movq %3, %%mm7 \n\t"
1055  "pxor %4, %%mm7 \n\t"
1056  "movq %%mm7, %%mm6 \n\t"
1057  "pxor %5, %%mm7 \n\t"
1058  ".p2align 4 \n\t"
1059  "1: \n\t"
1060  PREFETCH" 32(%1, %0) \n\t"
1061  "movq (%1, %0), %%mm0 \n\t"
1062  "movq 8(%1, %0), %%mm1 \n\t"
1063 # if COMPILE_TEMPLATE_MMXEXT
1064  "pshufw $177, %%mm0, %%mm3 \n\t"
1065  "pshufw $177, %%mm1, %%mm5 \n\t"
1066  "pand %%mm7, %%mm0 \n\t"
1067  "pand %%mm6, %%mm3 \n\t"
1068  "pand %%mm7, %%mm1 \n\t"
1069  "pand %%mm6, %%mm5 \n\t"
1070  "por %%mm3, %%mm0 \n\t"
1071  "por %%mm5, %%mm1 \n\t"
1072 # else
1073  "movq %%mm0, %%mm2 \n\t"
1074  "movq %%mm1, %%mm4 \n\t"
1075  "pand %%mm7, %%mm0 \n\t"
1076  "pand %%mm6, %%mm2 \n\t"
1077  "pand %%mm7, %%mm1 \n\t"
1078  "pand %%mm6, %%mm4 \n\t"
1079  "movq %%mm2, %%mm3 \n\t"
1080  "movq %%mm4, %%mm5 \n\t"
1081  "pslld $16, %%mm2 \n\t"
1082  "psrld $16, %%mm3 \n\t"
1083  "pslld $16, %%mm4 \n\t"
1084  "psrld $16, %%mm5 \n\t"
1085  "por %%mm2, %%mm0 \n\t"
1086  "por %%mm4, %%mm1 \n\t"
1087  "por %%mm3, %%mm0 \n\t"
1088  "por %%mm5, %%mm1 \n\t"
1089 # endif
1090  MOVNTQ" %%mm0, (%2, %0) \n\t"
1091  MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1092  "add $16, %0 \n\t"
1093  "js 1b \n\t"
1094  SFENCE" \n\t"
1095  EMMS" \n\t"
1096  "2: \n\t"
1097  : "+&r"(idx)
1098  : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1099  : "memory");
1100  for (; idx<15; idx+=4) {
1101  register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1102  v &= 0xff00ff;
1103  *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1104  }
1105 }
1106 
1107 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1108 {
1109  unsigned i;
1110  x86_reg mmx_size= 23 - src_size;
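 /* Same negative-offset idiom as shuffle_bytes_2103: the counter runs from
  * 23 - src_size up toward zero, converting 24 bytes (8 pixels) per pass;
  * the 0..23 leftover bytes are swapped by the scalar loop below. */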
1111  __asm__ volatile (
1112  "test %%"REG_a", %%"REG_a" \n\t"
1113  "jns 2f \n\t"
1114  "movq "MANGLE(mask24r)", %%mm5 \n\t"
1115  "movq "MANGLE(mask24g)", %%mm6 \n\t"
1116  "movq "MANGLE(mask24b)", %%mm7 \n\t"
1117  ".p2align 4 \n\t"
1118  "1: \n\t"
1119  PREFETCH" 32(%1, %%"REG_a") \n\t"
1120  "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1121  "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1122  "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1123  "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1124  "pand %%mm5, %%mm0 \n\t"
1125  "pand %%mm6, %%mm1 \n\t"
1126  "pand %%mm7, %%mm2 \n\t"
1127  "por %%mm0, %%mm1 \n\t"
1128  "por %%mm2, %%mm1 \n\t"
1129  "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1130  MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1131  "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1132  "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1133  "pand %%mm7, %%mm0 \n\t"
1134  "pand %%mm5, %%mm1 \n\t"
1135  "pand %%mm6, %%mm2 \n\t"
1136  "por %%mm0, %%mm1 \n\t"
1137  "por %%mm2, %%mm1 \n\t"
1138  "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1139  MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1140  "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1141  "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1142  "pand %%mm6, %%mm0 \n\t"
1143  "pand %%mm7, %%mm1 \n\t"
1144  "pand %%mm5, %%mm2 \n\t"
1145  "por %%mm0, %%mm1 \n\t"
1146  "por %%mm2, %%mm1 \n\t"
1147  MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1148  "add $24, %%"REG_a" \n\t"
1149  " js 1b \n\t"
1150  "2: \n\t"
1151  : "+a" (mmx_size)
1152  : "r" (src-mmx_size), "r"(dst-mmx_size)
1153  );
1154 
1155  __asm__ volatile(SFENCE:::"memory");
1156  __asm__ volatile(EMMS:::"memory");
1157 
1158  if (mmx_size==23) return; // finished: src_size was a multiple of 8 pixels (24 bytes)
1159 
1160  src+= src_size;
1161  dst+= src_size;
1162  src_size= 23-mmx_size;
1163  src-= src_size;
1164  dst-= src_size;
1165  for (i=0; i<src_size; i+=3) {
1166  register uint8_t x;
1167  x = src[i + 2];
1168  dst[i + 1] = src[i + 1];
1169  dst[i + 2] = src[i + 0];
1170  dst[i + 0] = x;
1171  }
1172 }
1173 
1174 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1175  int width, int height,
1176  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1177 {
1178  int y;
1179  const x86_reg chromWidth= width>>1;
1180  for (y=0; y<height; y++) {
1181  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
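 /* Scalar equivalent of the MMX loop below, for one output row of YUY2:
  *     for (i = 0; i < chromWidth; i++) {
  *         dst[4*i+0] = ysrc[2*i];
  *         dst[4*i+1] = usrc[i];
  *         dst[4*i+2] = ysrc[2*i+1];
  *         dst[4*i+3] = vsrc[i];
  *     }
  */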
1182  __asm__ volatile(
1183  "xor %%"REG_a", %%"REG_a" \n\t"
1184  ".p2align 4 \n\t"
1185  "1: \n\t"
1186  PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1187  PREFETCH" 32(%2, %%"REG_a") \n\t"
1188  PREFETCH" 32(%3, %%"REG_a") \n\t"
1189  "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1190  "movq %%mm0, %%mm2 \n\t" // U(0)
1191  "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1192  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1193  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1194 
1195  "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1196  "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1197  "movq %%mm3, %%mm4 \n\t" // Y(0)
1198  "movq %%mm5, %%mm6 \n\t" // Y(8)
1199  "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1200  "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1201  "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1202  "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1203 
1204  MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1205  MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1206  MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1207  MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1208 
1209  "add $8, %%"REG_a" \n\t"
1210  "cmp %4, %%"REG_a" \n\t"
1211  " jb 1b \n\t"
1212  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1213  : "%"REG_a
1214  );
1215  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1216  usrc += chromStride;
1217  vsrc += chromStride;
1218  }
1219  ysrc += lumStride;
1220  dst += dstStride;
1221  }
1222  __asm__(EMMS" \n\t"
1223  SFENCE" \n\t"
1224  :::"memory");
1225 }
1226 
1227 /**
1228  * Height should be a multiple of 2 and width should be a multiple of 16.
1229  * (If this is a problem for anyone then tell me, and I will fix it.)
1230  */
1231 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1232  int width, int height,
1233  int lumStride, int chromStride, int dstStride)
1234 {
1235  //FIXME interpolate chroma
1236  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1237 }
1238 
1239 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1240  int width, int height,
1241  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1242 {
1243  int y;
1244  const x86_reg chromWidth= width>>1;
1245  for (y=0; y<height; y++) {
1246  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1247  __asm__ volatile(
1248  "xor %%"REG_a", %%"REG_a" \n\t"
1249  ".p2align 4 \n\t"
1250  "1: \n\t"
1251  PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1252  PREFETCH" 32(%2, %%"REG_a") \n\t"
1253  PREFETCH" 32(%3, %%"REG_a") \n\t"
1254  "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1255  "movq %%mm0, %%mm2 \n\t" // U(0)
1256  "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1257  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1258  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1259 
1260  "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1261  "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1262  "movq %%mm0, %%mm4 \n\t" // Y(0)
1263  "movq %%mm2, %%mm6 \n\t" // Y(8)
1264  "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1265  "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1266  "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1267  "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1268 
1269  MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1270  MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1271  MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1272  MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1273 
1274  "add $8, %%"REG_a" \n\t"
1275  "cmp %4, %%"REG_a" \n\t"
1276  " jb 1b \n\t"
1277  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1278  : "%"REG_a
1279  );
1280  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1281  usrc += chromStride;
1282  vsrc += chromStride;
1283  }
1284  ysrc += lumStride;
1285  dst += dstStride;
1286  }
1287  __asm__(EMMS" \n\t"
1288  SFENCE" \n\t"
1289  :::"memory");
1290 }
1291 
1292 /**
1293  * Height should be a multiple of 2 and width should be a multiple of 16.
1294  * (If this is a problem for anyone then tell me, and I will fix it.)
1295  */
1296 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1297  int width, int height,
1298  int lumStride, int chromStride, int dstStride)
1299 {
1300  //FIXME interpolate chroma
1301  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1302 }
1303 
1304 /**
1305  * Width should be a multiple of 16.
1306  */
1307 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1308  int width, int height,
1309  int lumStride, int chromStride, int dstStride)
1310 {
1311  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1312 }
1313 
1314 /**
1315  * Width should be a multiple of 16.
1316  */
1317 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1318  int width, int height,
1319  int lumStride, int chromStride, int dstStride)
1320 {
1321  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1322 }
1323 
1324 /**
1325  * Height should be a multiple of 2 and width should be a multiple of 16.
1326  * (If this is a problem for anyone then tell me, and I will fix it.)
1327  */
1328 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1329  int width, int height,
1330  int lumStride, int chromStride, int srcStride)
1331 {
1332  int y;
1333  const x86_reg chromWidth= width>>1;
1334  for (y=0; y<height; y+=2) {
1335  __asm__ volatile(
1336  "xor %%"REG_a", %%"REG_a" \n\t"
1337  "pcmpeqw %%mm7, %%mm7 \n\t"
1338  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1339  ".p2align 4 \n\t"
1340  "1: \n\t"
1341  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1342  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1343  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1344  "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1345  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1346  "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1347  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1348  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1349  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1350  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1351  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1352 
1353  MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1354 
1355  "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1356  "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1357  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1358  "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1359  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1360  "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1361  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1362  "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1363  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1364  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1365 
1366  MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1367 
1368  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1369  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1370  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1371  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1372  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1373  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1374  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1375  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1376 
1377  MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1378  MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1379 
1380  "add $8, %%"REG_a" \n\t"
1381  "cmp %4, %%"REG_a" \n\t"
1382  " jb 1b \n\t"
1383  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1384  : "memory", "%"REG_a
1385  );
1386 
1387  ydst += lumStride;
1388  src += srcStride;
1389 
1390  __asm__ volatile(
1391  "xor %%"REG_a", %%"REG_a" \n\t"
1392  ".p2align 4 \n\t"
1393  "1: \n\t"
1394  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1395  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1396  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1397  "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1398  "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1399  "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1400  "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1401  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1402  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1403  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1404  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1405 
1406  MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1407  MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1408 
1409  "add $8, %%"REG_a" \n\t"
1410  "cmp %4, %%"REG_a" \n\t"
1411  " jb 1b \n\t"
1412 
1413  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1414  : "memory", "%"REG_a
1415  );
1416  udst += chromStride;
1417  vdst += chromStride;
1418  ydst += lumStride;
1419  src += srcStride;
1420  }
1421  __asm__ volatile(EMMS" \n\t"
1422  SFENCE" \n\t"
1423  :::"memory");
1424 }
1425 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1426 
1427 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1428 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1429 {
1430  int x,y;
1431 
1432  dst[0]= src[0];
1433 
1434  // first line
1435  for (x=0; x<srcWidth-1; x++) {
1436  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1437  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1438  }
1439  dst[2*srcWidth-1]= src[srcWidth-1];
1440 
1441  dst+= dstStride;
1442 
1443  for (y=1; y<srcHeight; y++) {
1444  const x86_reg mmxSize= srcWidth&~15;
1445  __asm__ volatile(
1446  "mov %4, %%"REG_a" \n\t"
1447  "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1448  "movq (%0, %%"REG_a"), %%mm4 \n\t"
1449  "movq %%mm4, %%mm2 \n\t"
1450  "psllq $8, %%mm4 \n\t"
1451  "pand %%mm0, %%mm2 \n\t"
1452  "por %%mm2, %%mm4 \n\t"
1453  "movq (%1, %%"REG_a"), %%mm5 \n\t"
1454  "movq %%mm5, %%mm3 \n\t"
1455  "psllq $8, %%mm5 \n\t"
1456  "pand %%mm0, %%mm3 \n\t"
1457  "por %%mm3, %%mm5 \n\t"
1458  "1: \n\t"
1459  "movq (%0, %%"REG_a"), %%mm0 \n\t"
1460  "movq (%1, %%"REG_a"), %%mm1 \n\t"
1461  "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1462  "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1463  PAVGB" %%mm0, %%mm5 \n\t"
1464  PAVGB" %%mm0, %%mm3 \n\t"
1465  PAVGB" %%mm0, %%mm5 \n\t"
1466  PAVGB" %%mm0, %%mm3 \n\t"
1467  PAVGB" %%mm1, %%mm4 \n\t"
1468  PAVGB" %%mm1, %%mm2 \n\t"
1469  PAVGB" %%mm1, %%mm4 \n\t"
1470  PAVGB" %%mm1, %%mm2 \n\t"
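 /* pavgb computes (a+b+1)>>1; applying it twice with the same source gives
  * approximately (3*a + b) >> 2, the same 3:1 blend as the C edge loops
  * below, without unpacking bytes to words. */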
1471  "movq %%mm5, %%mm7 \n\t"
1472  "movq %%mm4, %%mm6 \n\t"
1473  "punpcklbw %%mm3, %%mm5 \n\t"
1474  "punpckhbw %%mm3, %%mm7 \n\t"
1475  "punpcklbw %%mm2, %%mm4 \n\t"
1476  "punpckhbw %%mm2, %%mm6 \n\t"
1477  MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1478  MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1479  MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1480  MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1481  "add $8, %%"REG_a" \n\t"
1482  "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1483  "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1484  " js 1b \n\t"
1485  :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1486  "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1487  "g" (-mmxSize)
1488  : "%"REG_a
1489  );
1490 
1491  for (x=mmxSize-1; x<srcWidth-1; x++) {
1492  dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1493  dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1494  dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1495  dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1496  }
1497  dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1498  dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1499 
1500  dst+=dstStride*2;
1501  src+=srcStride;
1502  }
1503 
1504  // last line
1505  dst[0]= src[0];
1506 
1507  for (x=0; x<srcWidth-1; x++) {
1508  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1509  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1510  }
1511  dst[2*srcWidth-1]= src[srcWidth-1];
1512 
1513  __asm__ volatile(EMMS" \n\t"
1514  SFENCE" \n\t"
1515  :::"memory");
1516 }
1517 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
1518 
1519 #if !COMPILE_TEMPLATE_AMD3DNOW
1520 
1521 /**
1522  * Height should be a multiple of 2 and width should be a multiple of 16.
1523  * (If this is a problem for anyone then tell me, and I will fix it.)
1524  * Chrominance data is only taken from every second line, others are ignored.
1525  */
1526 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1527  int width, int height,
1528  int lumStride, int chromStride, int srcStride)
1529 {
1530  int y;
1531  const x86_reg chromWidth= width>>1;
1532  for (y=0; y<height; y+=2) {
1533  __asm__ volatile(
1534  "xor %%"REG_a", %%"REG_a" \n\t"
1535  "pcmpeqw %%mm7, %%mm7 \n\t"
1536  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1537  ".p2align 4 \n\t"
1538  "1: \n\t"
1539  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1540  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1541  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1542  "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1543  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1544  "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1545  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1546  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1547  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1548  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1549  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1550 
1551  MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1552 
1553  "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1554  "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1555  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1556  "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1557  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1558  "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1559  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1560  "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1561  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1562  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1563 
1564  MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1565 
1566  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1567  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1568  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1569  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1570  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1571  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1572  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1573  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1574 
1575  MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1576  MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1577 
1578  "add $8, %%"REG_a" \n\t"
1579  "cmp %4, %%"REG_a" \n\t"
1580  " jb 1b \n\t"
1581  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1582  : "memory", "%"REG_a
1583  );
1584 
1585  ydst += lumStride;
1586  src += srcStride;
1587 
1588  __asm__ volatile(
1589  "xor %%"REG_a", %%"REG_a" \n\t"
1590  ".p2align 4 \n\t"
1591  "1: \n\t"
1592  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1593  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1594  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1595  "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1596  "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1597  "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1598  "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1599  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1600  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1601  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1602  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1603 
1604  MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1605  MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1606 
1607  "add $8, %%"REG_a" \n\t"
1608  "cmp %4, %%"REG_a" \n\t"
1609  " jb 1b \n\t"
1610 
1611  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1612  : "memory", "%"REG_a
1613  );
1614  udst += chromStride;
1615  vdst += chromStride;
1616  ydst += lumStride;
1617  src += srcStride;
1618  }
1619  __asm__ volatile(EMMS" \n\t"
1620  SFENCE" \n\t"
1621  :::"memory");
1622 }
1623 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1624 
1625 /**
1626  * Height should be a multiple of 2 and width should be a multiple of 2.
1627  * (If this is a problem for anyone then tell me, and I will fix it.)
1628  * Chrominance data is only taken from every second line,
1629  * others are ignored in the C version.
1630  * FIXME: Write HQ version.
1631  */
1632 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1633  int width, int height,
1634  int lumStride, int chromStride, int srcStride)
1635 {
1636  int y;
1637  const x86_reg chromWidth= width>>1;
1638  for (y=0; y<height-2; y+=2) {
1639  int i;
1640  for (i=0; i<2; i++) {
1641  __asm__ volatile(
1642  "mov %2, %%"REG_a" \n\t"
1643  "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1644  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1645  "pxor %%mm7, %%mm7 \n\t"
1646  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1647  ".p2align 4 \n\t"
1648  "1: \n\t"
1649  PREFETCH" 64(%0, %%"REG_d") \n\t"
1650  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1651  "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1652  "punpcklbw %%mm7, %%mm0 \n\t"
1653  "punpcklbw %%mm7, %%mm1 \n\t"
1654  "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1655  "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1656  "punpcklbw %%mm7, %%mm2 \n\t"
1657  "punpcklbw %%mm7, %%mm3 \n\t"
1658  "pmaddwd %%mm6, %%mm0 \n\t"
1659  "pmaddwd %%mm6, %%mm1 \n\t"
1660  "pmaddwd %%mm6, %%mm2 \n\t"
1661  "pmaddwd %%mm6, %%mm3 \n\t"
1662 #ifndef FAST_BGR2YV12
1663  "psrad $8, %%mm0 \n\t"
1664  "psrad $8, %%mm1 \n\t"
1665  "psrad $8, %%mm2 \n\t"
1666  "psrad $8, %%mm3 \n\t"
1667 #endif
1668  "packssdw %%mm1, %%mm0 \n\t"
1669  "packssdw %%mm3, %%mm2 \n\t"
1670  "pmaddwd %%mm5, %%mm0 \n\t"
1671  "pmaddwd %%mm5, %%mm2 \n\t"
1672  "packssdw %%mm2, %%mm0 \n\t"
1673  "psraw $7, %%mm0 \n\t"
1674 
1675  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1676  "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1677  "punpcklbw %%mm7, %%mm4 \n\t"
1678  "punpcklbw %%mm7, %%mm1 \n\t"
1679  "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1680  "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1681  "punpcklbw %%mm7, %%mm2 \n\t"
1682  "punpcklbw %%mm7, %%mm3 \n\t"
1683  "pmaddwd %%mm6, %%mm4 \n\t"
1684  "pmaddwd %%mm6, %%mm1 \n\t"
1685  "pmaddwd %%mm6, %%mm2 \n\t"
1686  "pmaddwd %%mm6, %%mm3 \n\t"
1687 #ifndef FAST_BGR2YV12
1688  "psrad $8, %%mm4 \n\t"
1689  "psrad $8, %%mm1 \n\t"
1690  "psrad $8, %%mm2 \n\t"
1691  "psrad $8, %%mm3 \n\t"
1692 #endif
1693  "packssdw %%mm1, %%mm4 \n\t"
1694  "packssdw %%mm3, %%mm2 \n\t"
1695  "pmaddwd %%mm5, %%mm4 \n\t"
1696  "pmaddwd %%mm5, %%mm2 \n\t"
1697  "add $24, %%"REG_d" \n\t"
1698  "packssdw %%mm2, %%mm4 \n\t"
1699  "psraw $7, %%mm4 \n\t"
1700 
1701  "packuswb %%mm4, %%mm0 \n\t"
1702  "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1703 
1704  MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1705  "add $8, %%"REG_a" \n\t"
1706  " js 1b \n\t"
1707  : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1708  : "%"REG_a, "%"REG_d
1709  );
1710  ydst += lumStride;
1711  src += srcStride;
1712  }
1713  src -= srcStride*2;
1714  __asm__ volatile(
1715  "mov %4, %%"REG_a" \n\t"
1716  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1717  "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1718  "pxor %%mm7, %%mm7 \n\t"
1719  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1720  "add %%"REG_d", %%"REG_d" \n\t"
1721  ".p2align 4 \n\t"
1722  "1: \n\t"
1723  PREFETCH" 64(%0, %%"REG_d") \n\t"
1724  PREFETCH" 64(%1, %%"REG_d") \n\t"
1725 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1726  "movq (%0, %%"REG_d"), %%mm0 \n\t"
1727  "movq (%1, %%"REG_d"), %%mm1 \n\t"
1728  "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1729  "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1730  PAVGB" %%mm1, %%mm0 \n\t"
1731  PAVGB" %%mm3, %%mm2 \n\t"
1732  "movq %%mm0, %%mm1 \n\t"
1733  "movq %%mm2, %%mm3 \n\t"
1734  "psrlq $24, %%mm0 \n\t"
1735  "psrlq $24, %%mm2 \n\t"
1736  PAVGB" %%mm1, %%mm0 \n\t"
1737  PAVGB" %%mm3, %%mm2 \n\t"
1738  "punpcklbw %%mm7, %%mm0 \n\t"
1739  "punpcklbw %%mm7, %%mm2 \n\t"
1740 #else
1741  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1742  "movd (%1, %%"REG_d"), %%mm1 \n\t"
1743  "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1744  "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1745  "punpcklbw %%mm7, %%mm0 \n\t"
1746  "punpcklbw %%mm7, %%mm1 \n\t"
1747  "punpcklbw %%mm7, %%mm2 \n\t"
1748  "punpcklbw %%mm7, %%mm3 \n\t"
1749  "paddw %%mm1, %%mm0 \n\t"
1750  "paddw %%mm3, %%mm2 \n\t"
1751  "paddw %%mm2, %%mm0 \n\t"
1752  "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1753  "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1754  "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1755  "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1756  "punpcklbw %%mm7, %%mm4 \n\t"
1757  "punpcklbw %%mm7, %%mm1 \n\t"
1758  "punpcklbw %%mm7, %%mm2 \n\t"
1759  "punpcklbw %%mm7, %%mm3 \n\t"
1760  "paddw %%mm1, %%mm4 \n\t"
1761  "paddw %%mm3, %%mm2 \n\t"
1762  "paddw %%mm4, %%mm2 \n\t"
1763  "psrlw $2, %%mm0 \n\t"
1764  "psrlw $2, %%mm2 \n\t"
1765 #endif
1766  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1767  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1768 
1769  "pmaddwd %%mm0, %%mm1 \n\t"
1770  "pmaddwd %%mm2, %%mm3 \n\t"
1771  "pmaddwd %%mm6, %%mm0 \n\t"
1772  "pmaddwd %%mm6, %%mm2 \n\t"
1773 #ifndef FAST_BGR2YV12
1774  "psrad $8, %%mm0 \n\t"
1775  "psrad $8, %%mm1 \n\t"
1776  "psrad $8, %%mm2 \n\t"
1777  "psrad $8, %%mm3 \n\t"
1778 #endif
1779  "packssdw %%mm2, %%mm0 \n\t"
1780  "packssdw %%mm3, %%mm1 \n\t"
1781  "pmaddwd %%mm5, %%mm0 \n\t"
1782  "pmaddwd %%mm5, %%mm1 \n\t"
1783  "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1784  "psraw $7, %%mm0 \n\t"
1785 
1786 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1787  "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1788  "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1789  "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1790  "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1791  PAVGB" %%mm1, %%mm4 \n\t"
1792  PAVGB" %%mm3, %%mm2 \n\t"
1793  "movq %%mm4, %%mm1 \n\t"
1794  "movq %%mm2, %%mm3 \n\t"
1795  "psrlq $24, %%mm4 \n\t"
1796  "psrlq $24, %%mm2 \n\t"
1797  PAVGB" %%mm1, %%mm4 \n\t"
1798  PAVGB" %%mm3, %%mm2 \n\t"
1799  "punpcklbw %%mm7, %%mm4 \n\t"
1800  "punpcklbw %%mm7, %%mm2 \n\t"
1801 #else
1802  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1803  "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1804  "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1805  "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1806  "punpcklbw %%mm7, %%mm4 \n\t"
1807  "punpcklbw %%mm7, %%mm1 \n\t"
1808  "punpcklbw %%mm7, %%mm2 \n\t"
1809  "punpcklbw %%mm7, %%mm3 \n\t"
1810  "paddw %%mm1, %%mm4 \n\t"
1811  "paddw %%mm3, %%mm2 \n\t"
1812  "paddw %%mm2, %%mm4 \n\t"
1813  "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1814  "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1815  "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1816  "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1817  "punpcklbw %%mm7, %%mm5 \n\t"
1818  "punpcklbw %%mm7, %%mm1 \n\t"
1819  "punpcklbw %%mm7, %%mm2 \n\t"
1820  "punpcklbw %%mm7, %%mm3 \n\t"
1821  "paddw %%mm1, %%mm5 \n\t"
1822  "paddw %%mm3, %%mm2 \n\t"
1823  "paddw %%mm5, %%mm2 \n\t"
1824  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1825  "psrlw $2, %%mm4 \n\t"
1826  "psrlw $2, %%mm2 \n\t"
1827 #endif
1828  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1829  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1830 
1831  "pmaddwd %%mm4, %%mm1 \n\t"
1832  "pmaddwd %%mm2, %%mm3 \n\t"
1833  "pmaddwd %%mm6, %%mm4 \n\t"
1834  "pmaddwd %%mm6, %%mm2 \n\t"
1835 #ifndef FAST_BGR2YV12
1836  "psrad $8, %%mm4 \n\t"
1837  "psrad $8, %%mm1 \n\t"
1838  "psrad $8, %%mm2 \n\t"
1839  "psrad $8, %%mm3 \n\t"
1840 #endif
1841  "packssdw %%mm2, %%mm4 \n\t"
1842  "packssdw %%mm3, %%mm1 \n\t"
1843  "pmaddwd %%mm5, %%mm4 \n\t"
1844  "pmaddwd %%mm5, %%mm1 \n\t"
1845  "add $24, %%"REG_d" \n\t"
1846  "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1847  "psraw $7, %%mm4 \n\t"
1848 
1849  "movq %%mm0, %%mm1 \n\t"
1850  "punpckldq %%mm4, %%mm0 \n\t"
1851  "punpckhdq %%mm4, %%mm1 \n\t"
1852  "packsswb %%mm1, %%mm0 \n\t"
1853  "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1854  "movd %%mm0, (%2, %%"REG_a") \n\t"
1855  "punpckhdq %%mm0, %%mm0 \n\t"
1856  "movd %%mm0, (%3, %%"REG_a") \n\t"
1857  "add $4, %%"REG_a" \n\t"
1858  " js 1b \n\t"
1859  : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1860  : "%"REG_a, "%"REG_d
1861  );
1862 
1863  udst += chromStride;
1864  vdst += chromStride;
1865  src += srcStride*2;
1866  }
1867 
1868  __asm__ volatile(EMMS" \n\t"
1869  SFENCE" \n\t"
1870  :::"memory");
1871 
1872  rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1873 }
1874 #endif /* !COMPILE_TEMPLATE_SSE2 */
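
For reference, here is a scalar sketch of the chroma pass that the loop above implements: each 2x2 block of source pixels is averaged (the PAVGB or paddw/"psrlw $2" sequences), then multiplied by a BT.601-style fixed-point matrix (the role of ff_bgr2VCoeff, the U coefficients held in %%mm6, and the ff_bgr2UVOffset bias). The integer coefficients below are common BT.601 approximations for illustration only, not the exact table values, and the b,g,r channel order is assumed to match what the asm reads.

static void chroma_2x2_avg_c(const uint8_t *row0, const uint8_t *row1,
                             uint8_t *udst, uint8_t *vdst, int chromWidth)
{
    int i;
    for (i = 0; i < chromWidth; i++) {
        /* average the four pixels of one 2x2 block, per channel */
        int b = (row0[6*i+0] + row0[6*i+3] + row1[6*i+0] + row1[6*i+3]) >> 2;
        int g = (row0[6*i+1] + row0[6*i+4] + row1[6*i+1] + row1[6*i+4]) >> 2;
        int r = (row0[6*i+2] + row0[6*i+5] + row1[6*i+2] + row1[6*i+5]) >> 2;
        /* BT.601 chroma matrix, coefficients scaled by 256 */
        udst[i] = ((-38*r -  74*g + 112*b + 128) >> 8) + 128;
        vdst[i] = ((112*r -  94*g -  18*b + 128) >> 8) + 128;
    }
}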
1875 
1876 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
1877 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1878  int width, int height, int src1Stride,
1879  int src2Stride, int dstStride)
1880 {
1881  int h;
1882 
1883  for (h=0; h < height; h++) {
1884  int w;
1885 
1886 #if COMPILE_TEMPLATE_SSE2
1887  __asm__(
1888  "xor %%"REG_a", %%"REG_a" \n\t"
1889  "1: \n\t"
1890  PREFETCH" 64(%1, %%"REG_a") \n\t"
1891  PREFETCH" 64(%2, %%"REG_a") \n\t"
1892  "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1893  "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
1894  "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1895  "punpcklbw %%xmm2, %%xmm0 \n\t"
1896  "punpckhbw %%xmm2, %%xmm1 \n\t"
1897  "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1898  "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1899  "add $16, %%"REG_a" \n\t"
1900  "cmp %3, %%"REG_a" \n\t"
1901  " jb 1b \n\t"
1902  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1903  : "memory", "%"REG_a""
1904  );
1905 #else
1906  __asm__(
1907  "xor %%"REG_a", %%"REG_a" \n\t"
1908  "1: \n\t"
1909  PREFETCH" 64(%1, %%"REG_a") \n\t"
1910  PREFETCH" 64(%2, %%"REG_a") \n\t"
1911  "movq (%1, %%"REG_a"), %%mm0 \n\t"
1912  "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
1913  "movq %%mm0, %%mm1 \n\t"
1914  "movq %%mm2, %%mm3 \n\t"
1915  "movq (%2, %%"REG_a"), %%mm4 \n\t"
1916  "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
1917  "punpcklbw %%mm4, %%mm0 \n\t"
1918  "punpckhbw %%mm4, %%mm1 \n\t"
1919  "punpcklbw %%mm5, %%mm2 \n\t"
1920  "punpckhbw %%mm5, %%mm3 \n\t"
1921  MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
1922  MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
1923  MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
1924  MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
1925  "add $16, %%"REG_a" \n\t"
1926  "cmp %3, %%"REG_a" \n\t"
1927  " jb 1b \n\t"
1928  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1929  : "memory", "%"REG_a
1930  );
1931 #endif
1932  for (w= (width&(~15)); w < width; w++) {
1933  dest[2*w+0] = src1[w];
1934  dest[2*w+1] = src2[w];
1935  }
1936  dest += dstStride;
1937  src1 += src1Stride;
1938  src2 += src2Stride;
1939  }
1940  __asm__(
1941  EMMS" \n\t"
1942  SFENCE" \n\t"
1943  ::: "memory"
1944  );
1945 }
1946 #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
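
Both SIMD paths above are 16-bytes-per-iteration versions of the plain byte zip below; punpcklbw/punpckhbw interleave the low and high halves of each chunk, and the scalar tail in the function finishes the last width&15 bytes. A scalar reference of the whole routine:

static void interleave_bytes_c(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, int width, int height,
                               int src1Stride, int src2Stride, int dstStride)
{
    int h, w;
    for (h = 0; h < height; h++) {
        for (w = 0; w < width; w++) {
            dest[2*w+0] = src1[w];  /* even output bytes from src1 */
            dest[2*w+1] = src2[w];  /* odd output bytes from src2  */
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
}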
1947 
1948 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
1949 void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1950  const uint8_t *src, const uint8_t *unused, int w,
1951  uint32_t *unused2);
1952 static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
1953  int width, int height, int srcStride,
1954  int dst1Stride, int dst2Stride)
1955 {
1956  int h;
1957 
1958  for (h = 0; h < height; h++) {
1959  RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL);
1960  src += srcStride;
1961  dst1 += dst1Stride;
1962  dst2 += dst2Stride;
1963  }
1964  __asm__(
1965  EMMS" \n\t"
1966  SFENCE" \n\t"
1967  ::: "memory"
1968  );
1969 }
1970 #endif /* !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM */
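
Per row, the yasm ff_nv12ToUV call splits one interleaved UV (NV12-style) row into planar U and V. A scalar equivalent of a single row:

static void nv12_to_uv_row_c(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src, int w)
{
    int i;
    for (i = 0; i < w; i++) {
        dstU[i] = src[2*i+0];  /* U sits at even offsets */
        dstV[i] = src[2*i+1];  /* V sits at odd offsets  */
    }
}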
1971 
1972 #if !COMPILE_TEMPLATE_SSE2
1973 #if !COMPILE_TEMPLATE_AMD3DNOW
1974 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1975  uint8_t *dst1, uint8_t *dst2,
1976  int width, int height,
1977  int srcStride1, int srcStride2,
1978  int dstStride1, int dstStride2)
1979 {
1980  x86_reg x, y;
1981  int w,h;
1982  w=width/2; h=height/2;
1983  __asm__ volatile(
1984  PREFETCH" %0 \n\t"
1985  PREFETCH" %1 \n\t"
1986  ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1987  for (y=0;y<h;y++) {
1988  const uint8_t* s1=src1+srcStride1*(y>>1);
1989  uint8_t* d=dst1+dstStride1*y;
1990  x=0;
1991  for (;x<w-31;x+=32) {
1992  __asm__ volatile(
1993  PREFETCH" 32(%1,%2) \n\t"
1994  "movq (%1,%2), %%mm0 \n\t"
1995  "movq 8(%1,%2), %%mm2 \n\t"
1996  "movq 16(%1,%2), %%mm4 \n\t"
1997  "movq 24(%1,%2), %%mm6 \n\t"
1998  "movq %%mm0, %%mm1 \n\t"
1999  "movq %%mm2, %%mm3 \n\t"
2000  "movq %%mm4, %%mm5 \n\t"
2001  "movq %%mm6, %%mm7 \n\t"
2002  "punpcklbw %%mm0, %%mm0 \n\t"
2003  "punpckhbw %%mm1, %%mm1 \n\t"
2004  "punpcklbw %%mm2, %%mm2 \n\t"
2005  "punpckhbw %%mm3, %%mm3 \n\t"
2006  "punpcklbw %%mm4, %%mm4 \n\t"
2007  "punpckhbw %%mm5, %%mm5 \n\t"
2008  "punpcklbw %%mm6, %%mm6 \n\t"
2009  "punpckhbw %%mm7, %%mm7 \n\t"
2010  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2011  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2012  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2013  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2014  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2015  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2016  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2017  MOVNTQ" %%mm7, 56(%0,%2,2)"
2018  :: "r"(d), "r"(s1), "r"(x)
2019  :"memory");
2020  }
2021  for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2022  }
2023  for (y=0;y<h;y++) {
2024  const uint8_t* s2=src2+srcStride2*(y>>1);
2025  uint8_t* d=dst2+dstStride2*y;
2026  x=0;
2027  for (;x<w-31;x+=32) {
2028  __asm__ volatile(
2029  PREFETCH" 32(%1,%2) \n\t"
2030  "movq (%1,%2), %%mm0 \n\t"
2031  "movq 8(%1,%2), %%mm2 \n\t"
2032  "movq 16(%1,%2), %%mm4 \n\t"
2033  "movq 24(%1,%2), %%mm6 \n\t"
2034  "movq %%mm0, %%mm1 \n\t"
2035  "movq %%mm2, %%mm3 \n\t"
2036  "movq %%mm4, %%mm5 \n\t"
2037  "movq %%mm6, %%mm7 \n\t"
2038  "punpcklbw %%mm0, %%mm0 \n\t"
2039  "punpckhbw %%mm1, %%mm1 \n\t"
2040  "punpcklbw %%mm2, %%mm2 \n\t"
2041  "punpckhbw %%mm3, %%mm3 \n\t"
2042  "punpcklbw %%mm4, %%mm4 \n\t"
2043  "punpckhbw %%mm5, %%mm5 \n\t"
2044  "punpcklbw %%mm6, %%mm6 \n\t"
2045  "punpckhbw %%mm7, %%mm7 \n\t"
2046  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2047  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2048  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2049  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2050  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2051  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2052  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2053  MOVNTQ" %%mm7, 56(%0,%2,2)"
2054  :: "r"(d), "r"(s2), "r"(x)
2055  :"memory");
2056  }
2057  for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2058  }
2059  __asm__(
2060  EMMS" \n\t"
2061  SFENCE" \n\t"
2062  ::: "memory"
2063  );
2064 }
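
The two plane loops above are identical and perform the same 2x upscale: every source byte is written twice (the "punpcklbw reg,reg" trick) and every source row feeds two destination rows (the y>>1 indexing). A scalar model for one plane, called once per plane:

static void plane_upsample2x_c(const uint8_t *src, uint8_t *dst,
                               int w, int h, int srcStride, int dstStride)
{
    x86_reg x;
    int y;
    for (y = 0; y < h; y++) {
        const uint8_t *s = src + srcStride * (y >> 1); /* row doubling   */
        uint8_t *d = dst + dstStride * y;
        for (x = 0; x < w; x++)
            d[2*x] = d[2*x+1] = s[x];                  /* pixel doubling */
    }
}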
2065 
2066 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2067  uint8_t *dst,
2068  int width, int height,
2069  int srcStride1, int srcStride2,
2070  int srcStride3, int dstStride)
2071 {
2072  x86_reg x;
2073  int y,w,h;
2074  w=width/2; h=height;
2075  for (y=0;y<h;y++) {
2076  const uint8_t* yp=src1+srcStride1*y;
2077  const uint8_t* up=src2+srcStride2*(y>>2);
2078  const uint8_t* vp=src3+srcStride3*(y>>2);
2079  uint8_t* d=dst+dstStride*y;
2080  x=0;
2081  for (;x<w-7;x+=8) {
2082  __asm__ volatile(
2083  PREFETCH" 32(%1, %0) \n\t"
2084  PREFETCH" 32(%2, %0) \n\t"
2085  PREFETCH" 32(%3, %0) \n\t"
2086  "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2087  "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2088  "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2089  "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2090  "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2091  "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2092  "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2093  "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2094  "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2095  "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2096 
2097  "movq %%mm1, %%mm6 \n\t"
2098  "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2099  "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2100  "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2101  MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2102  MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2103 
2104  "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2105  "movq 8(%1, %0, 4), %%mm0 \n\t"
2106  "movq %%mm0, %%mm3 \n\t"
2107  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2108  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2109  MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2110  MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2111 
2112  "movq %%mm4, %%mm6 \n\t"
2113  "movq 16(%1, %0, 4), %%mm0 \n\t"
2114  "movq %%mm0, %%mm3 \n\t"
2115  "punpcklbw %%mm5, %%mm4 \n\t"
2116  "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2117  "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2118  MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2119  MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2120 
2121  "punpckhbw %%mm5, %%mm6 \n\t"
2122  "movq 24(%1, %0, 4), %%mm0 \n\t"
2123  "movq %%mm0, %%mm3 \n\t"
2124  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2125  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2126  MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2127  MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2128 
2129  : "+r" (x)
2130  : "r"(yp), "r" (up), "r"(vp), "r"(d)
2131  :"memory");
2132  }
2133  for (; x<w; x++) {
2134  const int x2 = x<<2;
2135  d[8*x+0] = yp[x2];
2136  d[8*x+1] = up[x];
2137  d[8*x+2] = yp[x2+1];
2138  d[8*x+3] = vp[x];
2139  d[8*x+4] = yp[x2+2];
2140  d[8*x+5] = up[x];
2141  d[8*x+6] = yp[x2+3];
2142  d[8*x+7] = vp[x];
2143  }
2144  }
2145  __asm__(
2146  EMMS" \n\t"
2147  SFENCE" \n\t"
2148  ::: "memory"
2149  );
2150 }
2151 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
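
With YVU9's 4x4 chroma subsampling, each U/V sample covers four horizontal luma samples, so one up[x]/vp[x] pair is reused across two YUY2 macropixels, as the scalar tail above shows:

/* packed output for one x step (8 bytes of YUY2):
 *   Y0 U  Y1 V  Y2 U  Y3 V
 * where Y0..Y3 = yp[4*x .. 4*x+3] and U = up[x], V = vp[x] */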
2152 
2153 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2154 {
2155  dst += count;
2156  src += 2*count;
2157  count= - count;
2158 
2159  if(count <= -16) {
2160  count += 15;
2161  __asm__ volatile(
2162  "pcmpeqw %%mm7, %%mm7 \n\t"
2163  "psrlw $8, %%mm7 \n\t"
2164  "1: \n\t"
2165  "movq -30(%1, %0, 2), %%mm0 \n\t"
2166  "movq -22(%1, %0, 2), %%mm1 \n\t"
2167  "movq -14(%1, %0, 2), %%mm2 \n\t"
2168  "movq -6(%1, %0, 2), %%mm3 \n\t"
2169  "pand %%mm7, %%mm0 \n\t"
2170  "pand %%mm7, %%mm1 \n\t"
2171  "pand %%mm7, %%mm2 \n\t"
2172  "pand %%mm7, %%mm3 \n\t"
2173  "packuswb %%mm1, %%mm0 \n\t"
2174  "packuswb %%mm3, %%mm2 \n\t"
2175  MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2176  MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2177  "add $16, %0 \n\t"
2178  " js 1b \n\t"
2179  : "+r"(count)
2180  : "r"(src), "r"(dst)
2181  );
2182  count -= 15;
2183  }
2184  while(count<0) {
2185  dst[count]= src[2*count];
2186  count++;
2187  }
2188 }
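
The extract_* helpers below all share this idiom: bias the destination and source pointers by count, negate count, and run the index up toward zero so that a single add/js pair both advances and tests the loop; the count += 15 / count -= 15 adjustment stops the vector loop near the end and hands the remainder to the scalar tail (any overlap rewrites the same bytes, which is harmless). Semantically, extract_even reduces to:

static void extract_even_ref(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    x86_reg i;
    for (i = 0; i < count; i++)
        dst[i] = src[2*i];   /* keep the bytes at even offsets */
}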
2189 
2190 #if !COMPILE_TEMPLATE_AMD3DNOW
2191 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2192 {
2193  dst0+= count;
2194  dst1+= count;
2195  src += 4*count;
2196  count= - count;
2197  if(count <= -8) {
2198  count += 7;
2199  __asm__ volatile(
2200  "pcmpeqw %%mm7, %%mm7 \n\t"
2201  "psrlw $8, %%mm7 \n\t"
2202  "1: \n\t"
2203  "movq -28(%1, %0, 4), %%mm0 \n\t"
2204  "movq -20(%1, %0, 4), %%mm1 \n\t"
2205  "movq -12(%1, %0, 4), %%mm2 \n\t"
2206  "movq -4(%1, %0, 4), %%mm3 \n\t"
2207  "pand %%mm7, %%mm0 \n\t"
2208  "pand %%mm7, %%mm1 \n\t"
2209  "pand %%mm7, %%mm2 \n\t"
2210  "pand %%mm7, %%mm3 \n\t"
2211  "packuswb %%mm1, %%mm0 \n\t"
2212  "packuswb %%mm3, %%mm2 \n\t"
2213  "movq %%mm0, %%mm1 \n\t"
2214  "movq %%mm2, %%mm3 \n\t"
2215  "psrlw $8, %%mm0 \n\t"
2216  "psrlw $8, %%mm2 \n\t"
2217  "pand %%mm7, %%mm1 \n\t"
2218  "pand %%mm7, %%mm3 \n\t"
2219  "packuswb %%mm2, %%mm0 \n\t"
2220  "packuswb %%mm3, %%mm1 \n\t"
2221  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2222  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2223  "add $8, %0 \n\t"
2224  " js 1b \n\t"
2225  : "+r"(count)
2226  : "r"(src), "r"(dst0), "r"(dst1)
2227  );
2228  count -= 7;
2229  }
2230  while(count<0) {
2231  dst0[count]= src[4*count+0];
2232  dst1[count]= src[4*count+2];
2233  count++;
2234  }
2235 }
2236 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2237 
2238 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2239 {
2240  dst0 += count;
2241  dst1 += count;
2242  src0 += 4*count;
2243  src1 += 4*count;
2244  count= - count;
2245 #ifdef PAVGB
2246  if(count <= -8) {
2247  count += 7;
2248  __asm__ volatile(
2249  "pcmpeqw %%mm7, %%mm7 \n\t"
2250  "psrlw $8, %%mm7 \n\t"
2251  "1: \n\t"
2252  "movq -28(%1, %0, 4), %%mm0 \n\t"
2253  "movq -20(%1, %0, 4), %%mm1 \n\t"
2254  "movq -12(%1, %0, 4), %%mm2 \n\t"
2255  "movq -4(%1, %0, 4), %%mm3 \n\t"
2256  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2257  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2258  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2259  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2260  "pand %%mm7, %%mm0 \n\t"
2261  "pand %%mm7, %%mm1 \n\t"
2262  "pand %%mm7, %%mm2 \n\t"
2263  "pand %%mm7, %%mm3 \n\t"
2264  "packuswb %%mm1, %%mm0 \n\t"
2265  "packuswb %%mm3, %%mm2 \n\t"
2266  "movq %%mm0, %%mm1 \n\t"
2267  "movq %%mm2, %%mm3 \n\t"
2268  "psrlw $8, %%mm0 \n\t"
2269  "psrlw $8, %%mm2 \n\t"
2270  "pand %%mm7, %%mm1 \n\t"
2271  "pand %%mm7, %%mm3 \n\t"
2272  "packuswb %%mm2, %%mm0 \n\t"
2273  "packuswb %%mm3, %%mm1 \n\t"
2274  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2275  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2276  "add $8, %0 \n\t"
2277  " js 1b \n\t"
2278  : "+r"(count)
2279  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2280  );
2281  count -= 7;
2282  }
2283 #endif
2284  while(count<0) {
2285  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2286  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2287  count++;
2288  }
2289 }
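
One subtlety in the *avg variants: PAVGB rounds the average up, while the scalar tail truncates, so pixels handled by the two paths can differ in the lowest bit:

/* pavgb / pavgusb :  (a + b + 1) >> 1   -- rounds halves up
 * scalar tail     :  (a + b)     >> 1   -- truncates        */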
2290 
2291 #if !COMPILE_TEMPLATE_AMD3DNOW
2292 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2293 {
2294  dst0+= count;
2295  dst1+= count;
2296  src += 4*count;
2297  count= - count;
2298  if(count <= -8) {
2299  count += 7;
2300  __asm__ volatile(
2301  "pcmpeqw %%mm7, %%mm7 \n\t"
2302  "psrlw $8, %%mm7 \n\t"
2303  "1: \n\t"
2304  "movq -28(%1, %0, 4), %%mm0 \n\t"
2305  "movq -20(%1, %0, 4), %%mm1 \n\t"
2306  "movq -12(%1, %0, 4), %%mm2 \n\t"
2307  "movq -4(%1, %0, 4), %%mm3 \n\t"
2308  "psrlw $8, %%mm0 \n\t"
2309  "psrlw $8, %%mm1 \n\t"
2310  "psrlw $8, %%mm2 \n\t"
2311  "psrlw $8, %%mm3 \n\t"
2312  "packuswb %%mm1, %%mm0 \n\t"
2313  "packuswb %%mm3, %%mm2 \n\t"
2314  "movq %%mm0, %%mm1 \n\t"
2315  "movq %%mm2, %%mm3 \n\t"
2316  "psrlw $8, %%mm0 \n\t"
2317  "psrlw $8, %%mm2 \n\t"
2318  "pand %%mm7, %%mm1 \n\t"
2319  "pand %%mm7, %%mm3 \n\t"
2320  "packuswb %%mm2, %%mm0 \n\t"
2321  "packuswb %%mm3, %%mm1 \n\t"
2322  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2323  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2324  "add $8, %0 \n\t"
2325  " js 1b \n\t"
2326  : "+r"(count)
2327  : "r"(src), "r"(dst0), "r"(dst1)
2328  );
2329  count -= 7;
2330  }
2331  src++;
2332  while(count<0) {
2333  dst0[count]= src[4*count+0];
2334  dst1[count]= src[4*count+2];
2335  count++;
2336  }
2337 }
2338 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2339 
2340 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2341 {
2342  dst0 += count;
2343  dst1 += count;
2344  src0 += 4*count;
2345  src1 += 4*count;
2346  count= - count;
2347 #ifdef PAVGB
2348  if(count <= -8) {
2349  count += 7;
2350  __asm__ volatile(
2351  "pcmpeqw %%mm7, %%mm7 \n\t"
2352  "psrlw $8, %%mm7 \n\t"
2353  "1: \n\t"
2354  "movq -28(%1, %0, 4), %%mm0 \n\t"
2355  "movq -20(%1, %0, 4), %%mm1 \n\t"
2356  "movq -12(%1, %0, 4), %%mm2 \n\t"
2357  "movq -4(%1, %0, 4), %%mm3 \n\t"
2358  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2359  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2360  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2361  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2362  "psrlw $8, %%mm0 \n\t"
2363  "psrlw $8, %%mm1 \n\t"
2364  "psrlw $8, %%mm2 \n\t"
2365  "psrlw $8, %%mm3 \n\t"
2366  "packuswb %%mm1, %%mm0 \n\t"
2367  "packuswb %%mm3, %%mm2 \n\t"
2368  "movq %%mm0, %%mm1 \n\t"
2369  "movq %%mm2, %%mm3 \n\t"
2370  "psrlw $8, %%mm0 \n\t"
2371  "psrlw $8, %%mm2 \n\t"
2372  "pand %%mm7, %%mm1 \n\t"
2373  "pand %%mm7, %%mm3 \n\t"
2374  "packuswb %%mm2, %%mm0 \n\t"
2375  "packuswb %%mm3, %%mm1 \n\t"
2376  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2377  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2378  "add $8, %0 \n\t"
2379  " js 1b \n\t"
2380  : "+r"(count)
2381  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2382  );
2383  count -= 7;
2384  }
2385 #endif
2386  src0++;
2387  src1++;
2388  while(count<0) {
2389  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2390  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2391  count++;
2392  }
2393 }
2394 
2395 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2396  int width, int height,
2397  int lumStride, int chromStride, int srcStride)
2398 {
2399  int y;
2400  const int chromWidth= -((-width)>>1); /* ceil(width / 2) */
2401 
2402  for (y=0; y<height; y++) {
2403  RENAME(extract_even)(src, ydst, width);
2404  if(y&1) {
2405  RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2406  udst+= chromStride;
2407  vdst+= chromStride;
2408  }
2409 
2410  src += srcStride;
2411  ydst+= lumStride;
2412  }
2413  __asm__(
2414  EMMS" \n\t"
2415  SFENCE" \n\t"
2416  ::: "memory"
2417  );
2418 }
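
A scalar model of the wrapper above (illustrative; assumes even width and height). Luma is taken from every row; one chroma row is emitted per two source rows by averaging the odd-offset U/V bytes of the current and previous rows, which is exactly what extract_odd2avg(src-srcStride, src, ...) does:

static void yuyv_to_yuv420_ref(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                               const uint8_t *src, int width, int height,
                               int lumStride, int chromStride, int srcStride)
{
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            ydst[x] = src[2*x];                       /* Y at even bytes */
        if (y & 1) {                                  /* 1 chroma row per 2 */
            const uint8_t *prev = src - srcStride;
            for (x = 0; x < width / 2; x++) {
                udst[x] = (prev[4*x+1] + src[4*x+1]) >> 1;
                vdst[x] = (prev[4*x+3] + src[4*x+3]) >> 1;
            }
            udst += chromStride;
            vdst += chromStride;
        }
        src  += srcStride;
        ydst += lumStride;
    }
}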
2419 
2420 #if !COMPILE_TEMPLATE_AMD3DNOW
2421 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2422  int width, int height,
2423  int lumStride, int chromStride, int srcStride)
2424 {
2425  int y;
2426  const int chromWidth= -((-width)>>1);
2427 
2428  for (y=0; y<height; y++) {
2429  RENAME(extract_even)(src, ydst, width);
2430  RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2431 
2432  src += srcStride;
2433  ydst+= lumStride;
2434  udst+= chromStride;
2435  vdst+= chromStride;
2436  }
2437  __asm__(
2438  EMMS" \n\t"
2439  SFENCE" \n\t"
2440  ::: "memory"
2441  );
2442 }
2443 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2444 
2445 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2446  int width, int height,
2447  int lumStride, int chromStride, int srcStride)
2448 {
2449  int y;
2450  const int chromWidth= -((-width)>>1);
2451 
2452  for (y=0; y<height; y++) {
2453  RENAME(extract_even)(src+1, ydst, width);
2454  if(y&1) {
2455  RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2456  udst+= chromStride;
2457  vdst+= chromStride;
2458  }
2459 
2460  src += srcStride;
2461  ydst+= lumStride;
2462  }
2463  __asm__(
2464  EMMS" \n\t"
2465  SFENCE" \n\t"
2466  ::: "memory"
2467  );
2468 }
2469 
2470 #if !COMPILE_TEMPLATE_AMD3DNOW
2471 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2472  int width, int height,
2473  int lumStride, int chromStride, int srcStride)
2474 {
2475  int y;
2476  const int chromWidth= -((-width)>>1);
2477 
2478  for (y=0; y<height; y++) {
2479  RENAME(extract_even)(src+1, ydst, width);
2480  RENAME(extract_even2)(src, udst, vdst, chromWidth);
2481 
2482  src += srcStride;
2483  ydst+= lumStride;
2484  udst+= chromStride;
2485  vdst+= chromStride;
2486  }
2487  __asm__(
2488  EMMS" \n\t"
2489  SFENCE" \n\t"
2490  ::: "memory"
2491  );
2492 }
2493 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2494 #endif /* !COMPILE_TEMPLATE_SSE2 */
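
The four packed-to-planar wrappers above differ only in where luma sits in the packed stream:

/* YUYV: Y0 U0 Y1 V0 ...  luma at even offsets  -> extract_even(src),
 *                        chroma at odd offsets -> extract_odd2 / _odd2avg
 * UYVY: U0 Y0 V0 Y1 ...  luma at odd offsets   -> extract_even(src+1),
 *                        chroma at even offsets-> extract_even2 / _even2avg */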
2495 
2496 static av_cold void RENAME(rgb2rgb_init)(void)
2497 {
2498 #if !COMPILE_TEMPLATE_SSE2
2499 #if !COMPILE_TEMPLATE_AMD3DNOW
2500  rgb15to16 = RENAME(rgb15to16);
2501  rgb15tobgr24 = RENAME(rgb15tobgr24);
2502  rgb15to32 = RENAME(rgb15to32);
2503  rgb16tobgr24 = RENAME(rgb16tobgr24);
2504  rgb16to32 = RENAME(rgb16to32);
2505  rgb16to15 = RENAME(rgb16to15);
2506  rgb24tobgr16 = RENAME(rgb24tobgr16);
2507  rgb24tobgr15 = RENAME(rgb24tobgr15);
2508  rgb24tobgr32 = RENAME(rgb24tobgr32);
2509  rgb32to16 = RENAME(rgb32to16);
2510  rgb32to15 = RENAME(rgb32to15);
2511  rgb32tobgr24 = RENAME(rgb32tobgr24);
2512  rgb24to15 = RENAME(rgb24to15);
2513  rgb24to16 = RENAME(rgb24to16);
2514  rgb24tobgr24 = RENAME(rgb24tobgr24);
2515  shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2516  rgb32tobgr16 = RENAME(rgb32tobgr16);
2517  rgb32tobgr15 = RENAME(rgb32tobgr15);
2518  yv12toyuy2 = RENAME(yv12toyuy2);
2519  yv12touyvy = RENAME(yv12touyvy);
2520  yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2521  yuv422ptouyvy = RENAME(yuv422ptouyvy);
2522  yuy2toyv12 = RENAME(yuy2toyv12);
2523  vu9_to_vu12 = RENAME(vu9_to_vu12);
2524  yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2525  uyvytoyuv422 = RENAME(uyvytoyuv422);
2526  yuyvtoyuv422 = RENAME(yuyvtoyuv422);
2527 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2528 
2529 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2530  planar2x = RENAME(planar2x);
2531 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
2532  rgb24toyv12 = RENAME(rgb24toyv12);
2533 
2534  yuyvtoyuv420 = RENAME(yuyvtoyuv420);
2535  uyvytoyuv420 = RENAME(uyvytoyuv420);
2536 #endif /* !COMPILE_TEMPLATE_SSE2 */
2537 
2538 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
2539  interleaveBytes = RENAME(interleaveBytes);
2540 #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
2541 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
2542  deinterleaveBytes = RENAME(deinterleaveBytes);
2543 #endif
2544 }
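
This template is meant to be included several times by a driver that defines RENAME() and the COMPILE_TEMPLATE_* flags once per instruction set, then installs one rgb2rgb_init variant after runtime CPU detection. A sketch of the pattern (the suffix and the exact flag set are illustrative; the real driver is rgb2rgb.c):

#undef  COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#undef  RENAME
#define RENAME(a) a ## _mmxext
#include "rgb2rgb_template.c"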