Libav
|
00001 /* 00002 * Copyright (C) 2004 the ffmpeg project 00003 * 00004 * This file is part of FFmpeg. 00005 * 00006 * FFmpeg is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * FFmpeg is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with FFmpeg; if not, write to the Free Software 00018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00019 */ 00020 00026 #include "libavutil/x86_cpu.h" 00027 #include "libavcodec/dsputil.h" 00028 #include "dsputil_mmx.h" 00029 #include "vp3dsp_mmx.h" 00030 00031 extern const uint16_t ff_vp3_idct_data[]; 00032 00033 // this is off by one or two for some cases when filter_limit is greater than 63 00034 // in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 00035 // out: p1 in mm4, p2 in mm3 00036 #define VP3_LOOP_FILTER(flim) \ 00037 "movq %%mm6, %%mm7 \n\t" \ 00038 "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ 00039 "psrlw $3, %%mm7 \n\t" \ 00040 "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ 00041 "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ 00042 "pxor %%mm4, %%mm2 \n\t" \ 00043 "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ 00044 "movq %%mm2, %%mm5 \n\t" \ 00045 "paddb %%mm2, %%mm2 \n\t" \ 00046 "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ 00047 "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ 00048 "pcmpeqb %%mm0, %%mm0 \n\t" \ 00049 "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ 00050 "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ 00051 "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ 00052 "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ 00053 "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ 00054 "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ 00055 "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ 00056 "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ 00057 "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ 00058 "psubusb %%mm7, %%mm6 \n\t" \ 00059 "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ 00060 \ 00061 "movq "#flim", %%mm5 \n\t" \ 00062 "pminub %%mm5, %%mm6 \n\t" \ 00063 "pminub %%mm5, %%mm7 \n\t" \ 00064 "movq %%mm6, %%mm0 \n\t" \ 00065 "movq %%mm7, %%mm1 \n\t" \ 00066 "paddb %%mm6, %%mm6 \n\t" \ 00067 "paddb %%mm7, %%mm7 \n\t" \ 00068 "pminub %%mm5, %%mm6 \n\t" \ 00069 "pminub %%mm5, %%mm7 \n\t" \ 00070 "psubb %%mm0, %%mm6 \n\t" \ 00071 "psubb %%mm1, %%mm7 \n\t" \ 00072 "paddusb %%mm7, %%mm4 \n\t" \ 00073 "psubusb %%mm6, %%mm4 \n\t" \ 00074 "psubusb %%mm7, %%mm3 \n\t" \ 00075 "paddusb %%mm6, %%mm3 \n\t" 00076 00077 #define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ 00078 "movd "#mm", %0 \n\t" \ 00079 "movw %w0, -1"#dst0" \n\t" \ 00080 "psrlq $32, "#mm" \n\t" \ 00081 "shr $16, %0 \n\t" \ 00082 "movw %w0, -1"#dst1" \n\t" \ 00083 "movd "#mm", %0 \n\t" \ 00084 "movw %w0, -1"#dst2" \n\t" \ 00085 "shr $16, %0 \n\t" \ 00086 "movw %w0, -1"#dst3" \n\t" 00087 00088 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) 00089 { 00090 __asm__ volatile( 00091 "movq %0, %%mm6 \n\t" 00092 "movq %1, %%mm4 \n\t" 00093 "movq %2, %%mm2 \n\t" 00094 "movq %3, %%mm1 \n\t" 00095 00096 VP3_LOOP_FILTER(%4) 00097 00098 "movq %%mm4, %1 \n\t" 00099 "movq %%mm3, %2 \n\t" 00100 00101 : "+m" (*(uint64_t*)(src - 2*stride)), 00102 "+m" (*(uint64_t*)(src - 1*stride)), 00103 "+m" (*(uint64_t*)(src + 0*stride)), 00104 "+m" (*(uint64_t*)(src + 1*stride)) 00105 : "m"(*(uint64_t*)(bounding_values+129)) 00106 ); 00107 } 00108 00109 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) 00110 { 00111 x86_reg tmp; 00112 00113 __asm__ volatile( 00114 "movd -2(%1), %%mm6 \n\t" 00115 "movd -2(%1,%3), %%mm0 \n\t" 00116 "movd -2(%1,%3,2), %%mm1 \n\t" 00117 "movd -2(%1,%4), %%mm4 \n\t" 00118 00119 TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) 00120 VP3_LOOP_FILTER(%5) 00121 SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) 00122 00123 STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) 00124 STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) 00125 00126 : "=&r"(tmp) 00127 : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), 00128 "m"(*(uint64_t*)(bounding_values+129)) 00129 : "memory" 00130 ); 00131 } 00132 00133 /* from original comments: The Macro does IDct on 4 1-D Dcts */ 00134 #define BeginIDCT() \ 00135 "movq "I(3)", %%mm2 \n\t" \ 00136 "movq "C(3)", %%mm6 \n\t" \ 00137 "movq %%mm2, %%mm4 \n\t" \ 00138 "movq "J(5)", %%mm7 \n\t" \ 00139 "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ 00140 "movq "C(5)", %%mm1 \n\t" \ 00141 "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ 00142 "movq %%mm1, %%mm5 \n\t" \ 00143 "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ 00144 "movq "I(1)", %%mm3 \n\t" \ 00145 "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ 00146 "movq "C(1)", %%mm0 \n\t" \ 00147 "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ 00148 "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ 00149 "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \ 00150 "movq "J(7)", %%mm1 \n\t" \ 00151 "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ 00152 "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ 00153 "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ 00154 "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ 00155 "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ 00156 "movq "C(7)", %%mm7 \n\t" \ 00157 "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ 00158 "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ 00159 "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ 00160 "movq "I(2)", %%mm2 \n\t" \ 00161 "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ 00162 "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ 00163 "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ 00164 "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ 00165 "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ 00166 "movq "J(6)", %%mm5 \n\t" \ 00167 "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ 00168 "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ 00169 "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ 00170 "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ 00171 "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ 00172 "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ 00173 "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ 00174 "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ 00175 "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ 00176 "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ 00177 "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ 00178 "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ 00179 "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ 00180 "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ 00181 "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ 00182 "movq "C(4)", %%mm4 \n\t" \ 00183 "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ 00184 "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ 00185 "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ 00186 "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ 00187 "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ 00188 "movq "I(0)", %%mm6 \n\t" \ 00189 "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ 00190 "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \ 00191 "movq "J(4)", %%mm3 \n\t" \ 00192 "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \ 00193 "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ 00194 "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ 00195 "movq %%mm6, %%mm0 \n\t" \ 00196 "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ 00197 "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ 00198 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ 00199 "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ 00200 "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ 00201 "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ 00202 "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ 00203 "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ 00204 "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ 00205 "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ 00206 "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ 00207 "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ 00208 "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ 00209 00210 /* RowIDCT gets ready to transpose */ 00211 #define RowIDCT() \ 00212 BeginIDCT() \ 00213 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ 00214 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ 00215 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ 00216 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ 00217 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ 00218 "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ 00219 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ 00220 "paddsw %%mm3, %%mm3 \n\t" \ 00221 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ 00222 "paddsw %%mm5, %%mm5 \n\t" \ 00223 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ 00224 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ 00225 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ 00226 "paddsw %%mm0, %%mm0 \n\t" \ 00227 "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ 00228 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ 00229 00230 /* Column IDCT normalizes and stores final results */ 00231 #define ColumnIDCT() \ 00232 BeginIDCT() \ 00233 "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ 00234 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ 00235 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ 00236 "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \ 00237 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ 00238 "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \ 00239 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ 00240 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ 00241 "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ 00242 "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ 00243 "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ 00244 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ 00245 "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ 00246 "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ 00247 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ 00248 "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \ 00249 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ 00250 "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \ 00251 "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ 00252 "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ 00253 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ 00254 "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \ 00255 "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ 00256 "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \ 00257 "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ 00258 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ 00259 "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ 00260 "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ 00261 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \ 00262 "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \ 00263 "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ 00264 "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \ 00265 "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ 00266 "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ 00267 "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ 00268 00269 /* Following macro does two 4x4 transposes in place. 00270 00271 At entry (we assume): 00272 00273 r0 = a3 a2 a1 a0 00274 I(1) = b3 b2 b1 b0 00275 r2 = c3 c2 c1 c0 00276 r3 = d3 d2 d1 d0 00277 00278 r4 = e3 e2 e1 e0 00279 r5 = f3 f2 f1 f0 00280 r6 = g3 g2 g1 g0 00281 r7 = h3 h2 h1 h0 00282 00283 At exit, we have: 00284 00285 I(0) = d0 c0 b0 a0 00286 I(1) = d1 c1 b1 a1 00287 I(2) = d2 c2 b2 a2 00288 I(3) = d3 c3 b3 a3 00289 00290 J(4) = h0 g0 f0 e0 00291 J(5) = h1 g1 f1 e1 00292 J(6) = h2 g2 f2 e2 00293 J(7) = h3 g3 f3 e3 00294 00295 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. 00296 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 00297 00298 Since r1 is free at entry, we calculate the Js first. */ 00299 #define Transpose() \ 00300 "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ 00301 "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ 00302 "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ 00303 "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ 00304 "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ 00305 "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ 00306 "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ 00307 "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ 00308 "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ 00309 "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ 00310 "movq %%mm4, "J(4)"\n\t" \ 00311 "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ 00312 "movq %%mm5, "J(5)"\n\t" \ 00313 "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ 00314 "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ 00315 "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ 00316 "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ 00317 "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ 00318 "movq %%mm6, "J(7)"\n\t" \ 00319 "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ 00320 "movq %%mm1, "J(6)"\n\t" \ 00321 "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ 00322 "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ 00323 "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ 00324 "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ 00325 "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ 00326 "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ 00327 "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ 00328 "movq %%mm0, "I(0)"\n\t" \ 00329 "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ 00330 "movq %%mm1, "I(1)"\n\t" \ 00331 "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ 00332 "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ 00333 "movq %%mm4, "I(3)"\n\t" \ 00334 "movq %%mm2, "I(2)"\n\t" 00335 00336 void ff_vp3_idct_mmx(int16_t *output_data) 00337 { 00338 /* eax = quantized input 00339 * ebx = dequantizer matrix 00340 * ecx = IDCT constants 00341 * M(I) = ecx + MaskOffset(0) + I * 8 00342 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 00343 * edx = output 00344 * r0..r7 = mm0..mm7 00345 */ 00346 00347 #define C(x) AV_STRINGIFY(16*(x-1))"(%1)" 00348 #define OC_8 "%2" 00349 00350 /* at this point, function has completed dequantization + dezigzag + 00351 * partial transposition; now do the idct itself */ 00352 #define I(x) AV_STRINGIFY(16* x )"(%0)" 00353 #define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" 00354 00355 __asm__ volatile ( 00356 RowIDCT() 00357 Transpose() 00358 00359 #undef I 00360 #undef J 00361 #define I(x) AV_STRINGIFY(16* x + 64)"(%0)" 00362 #define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" 00363 00364 RowIDCT() 00365 Transpose() 00366 00367 #undef I 00368 #undef J 00369 #define I(x) AV_STRINGIFY(16*x)"(%0)" 00370 #define J(x) AV_STRINGIFY(16*x)"(%0)" 00371 00372 ColumnIDCT() 00373 00374 #undef I 00375 #undef J 00376 #define I(x) AV_STRINGIFY(16*x + 8)"(%0)" 00377 #define J(x) AV_STRINGIFY(16*x + 8)"(%0)" 00378 00379 ColumnIDCT() 00380 :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) 00381 ); 00382 #undef I 00383 #undef J 00384 00385 } 00386 00387 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) 00388 { 00389 ff_vp3_idct_mmx(block); 00390 put_signed_pixels_clamped_mmx(block, dest, line_size); 00391 } 00392 00393 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) 00394 { 00395 ff_vp3_idct_mmx(block); 00396 add_pixels_clamped_mmx(block, dest, line_size); 00397 } 00398 00399 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block) 00400 { 00401 int dc = block[0]; 00402 dc = (46341*dc)>>16; 00403 dc = (46341*dc + (8<<16))>>20; 00404 00405 __asm__ volatile( 00406 "movd %3, %%mm0 \n\t" 00407 "pshufw $0, %%mm0, %%mm0 \n\t" 00408 "pxor %%mm1, %%mm1 \n\t" 00409 "psubw %%mm0, %%mm1 \n\t" 00410 "packuswb %%mm0, %%mm0 \n\t" 00411 "packuswb %%mm1, %%mm1 \n\t" 00412 00413 #define DC_ADD \ 00414 "movq (%0), %%mm2 \n\t" \ 00415 "movq (%0,%1), %%mm3 \n\t" \ 00416 "paddusb %%mm0, %%mm2 \n\t" \ 00417 "movq (%0,%1,2), %%mm4 \n\t" \ 00418 "paddusb %%mm0, %%mm3 \n\t" \ 00419 "movq (%0,%2), %%mm5 \n\t" \ 00420 "paddusb %%mm0, %%mm4 \n\t" \ 00421 "paddusb %%mm0, %%mm5 \n\t" \ 00422 "psubusb %%mm1, %%mm2 \n\t" \ 00423 "psubusb %%mm1, %%mm3 \n\t" \ 00424 "movq %%mm2, (%0) \n\t" \ 00425 "psubusb %%mm1, %%mm4 \n\t" \ 00426 "movq %%mm3, (%0,%1) \n\t" \ 00427 "psubusb %%mm1, %%mm5 \n\t" \ 00428 "movq %%mm4, (%0,%1,2) \n\t" \ 00429 "movq %%mm5, (%0,%2) \n\t" 00430 00431 DC_ADD 00432 "lea (%0,%1,4), %0 \n\t" 00433 DC_ADD 00434 00435 : "+r"(dest) 00436 : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc) 00437 ); 00438 }