1 /*
2 * dct.cc --
3 *
4 * DCT code, also contains MMX version
5 *
6 * Copyright (c) 1994-2002 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions are met:
11 *
12 * A. Redistributions of source code must retain the above copyright notice,
13 * this list of conditions and the following disclaimer.
14 * B. Redistributions in binary form must reproduce the above copyright notice,
15 * this list of conditions and the following disclaimer in the documentation
16 * and/or other materials provided with the distribution.
17 * C. Neither the names of the copyright holders nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
22 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifndef lint
35 static const char rcsid[] =
36 "@(#) $Header: /usr/mash/src/repository/mash/mash-1/codec/dct.cc,v 1.16 2003/11/19 19:20:17 aswan Exp $";
37 #endif
38
39 #include <sys/types.h>
40 #include "misc/bsd-endian.h"
41 #include "codec/dct.h"
42 #include <stdio.h>
43
44 /* conditional declaration */
45 #if MMX_DCT_ENABLED
46 void domidct8x8llmW(short *inptr, short *quantptr, int *wsptr,
47 u_char *outptr, int stride);
48 #endif
49
50 /*
51 * Macros for fix-point (integer) arithmetic. FP_NBITS gives the number
52 * of binary digits past the decimal point. FP_MUL computes the product
53 * of two fixed point numbers. A fixed point number and an integer
54 * can be directly multiplied to give a fixed point number. FP_SCALE
55 * converts a floating point number to fixed point (and is used only
56 * at startup, not by the dct engine). FP_NORM converts a fixed
57 * point number to scalar by rounding to the closest integer.
58 * FP_JNORM is similar except it folds the jpeg bias of 128 into the
59 * rounding addition.
60 */
61 #define FP_NBITS 15
62 #define FP_MUL(a, b) ((((a) >> 5) * ((b) >> 5)) >> (FP_NBITS - 10))
63 #define FP_SCALE(v) (int)((double)(v) * double(1 << FP_NBITS) + 0.5)
64 #define FP_NORM(v) (((v) + (1 << (FP_NBITS-1))) >> FP_NBITS)
65 #define FP_JNORM(v) (((v) + (257 << (FP_NBITS-1))) >> FP_NBITS)
66
67 #define M(n) ((m0 >> (n)) & 1)
68
69 /*
70 * This macro stolen from nv.
71 */
72 /* Sick little macro which will limit x to [0..255] with logical ops */
73 #define LIMIT8(x, t) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
74 #define LIMIT(x, t) (LIMIT8((x), t) & 0xff)
75
76 /* row order */
77 const u_char ROWZAG[] = {
78 0, 1, 8, 16, 9, 2, 3, 10,
79 17, 24, 32, 25, 18, 11, 4, 5,
80 12, 19, 26, 33, 40, 48, 41, 34,
81 27, 20, 13, 6, 7, 14, 21, 28,
82 35, 42, 49, 56, 57, 50, 43, 36,
83 29, 22, 15, 23, 30, 37, 44, 51,
84 58, 59, 52, 45, 38, 31, 39, 46,
85 53, 60, 61, 54, 47, 55, 62, 63,
86 0, 0, 0, 0, 0, 0, 0, 0,
87 0, 0, 0, 0, 0, 0, 0, 0
88 };
89 /* column order */
90 const u_char COLZAG[] = {
91 0, 8, 1, 2, 9, 16, 24, 17,
92 10, 3, 4, 11, 18, 25, 32, 40,
93 33, 26, 19, 12, 5, 6, 13, 20,
94 27, 34, 41, 48, 56, 49, 42, 35,
95 28, 21, 14, 7, 15, 22, 29, 36,
96 43, 50, 57, 58, 51, 44, 37, 30,
97 23, 31, 38, 45, 52, 59, 60, 53,
98 46, 39, 47, 54, 61, 62, 55, 63,
99 0, 0, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0
101 };
102
103 #define A1 FP_SCALE(0.7071068)
104 #define A2 FP_SCALE(0.5411961)
105 #define A3 A1
106 #define A4 FP_SCALE(1.3065630)
107 #define A5 FP_SCALE(0.3826834)
108
109 #define FA1 (0.707106781f)
110 #define FA2 (0.541196100f)
111 #define FA3 FA1
112 #define FA4 (1.306562965f)
113 #define FA5 (0.382683433f)
114
115 /*
116 * these magic numbers are scaling factors for each coef of the 1-d
117 * AA&N DCT. The scale factor for coef 0 is 1 and coef 1<=n<=7 is
118 * cos(n*PI/16)*sqrt(2). There is also a normalization of sqrt(8).
119 * Formally you divide by the scale factor but we multiply by the
120 * inverse because it's faster. So the numbers below are the inverse
121 * of what was just described.
122 */
123 #define B0 0.35355339059327376220
124 #define B1 0.25489778955207958447
125 #define B2 0.27059805007309849220
126 #define B3 0.30067244346752264027
127 #define B4 0.35355339059327376220
128 #define B5 0.44998811156820785231
129 #define B6 0.65328148243818826392
130 #define B7 1.28145772387075308943
131
132 /*
133 * Output multipliers for AA&N DCT
134 * (i.e., first stage multipliers for inverse DCT).
135 */
136 static const double first_stage[8] = { B0, B1, B2, B3, B4, B5, B6, B7, };
137
138 #ifdef _MSC_VER
139 // 'initializing' : truncation from 'const double to float
140 #pragma warning(disable: 4305)
141 #endif
142
143 /*
144 * The first_stage array crossed with itself. This allows us
145 * to embed the first stage multipliers of the row pass by
146 * computing scaled versions of the columns.
147 */
148 static const int cross_stage[64] = {
149 FP_SCALE(B0 * B0),
150 FP_SCALE(B0 * B1),
151 FP_SCALE(B0 * B2),
152 FP_SCALE(B0 * B3),
153 FP_SCALE(B0 * B4),
154 FP_SCALE(B0 * B5),
155 FP_SCALE(B0 * B6),
156 FP_SCALE(B0 * B7),
157
158 FP_SCALE(B1 * B0),
159 FP_SCALE(B1 * B1),
160 FP_SCALE(B1 * B2),
161 FP_SCALE(B1 * B3),
162 FP_SCALE(B1 * B4),
163 FP_SCALE(B1 * B5),
164 FP_SCALE(B1 * B6),
165 FP_SCALE(B1 * B7),
166
167 FP_SCALE(B2 * B0),
168 FP_SCALE(B2 * B1),
169 FP_SCALE(B2 * B2),
170 FP_SCALE(B2 * B3),
171 FP_SCALE(B2 * B4),
172 FP_SCALE(B2 * B5),
173 FP_SCALE(B2 * B6),
174 FP_SCALE(B2 * B7),
175
176 FP_SCALE(B3 * B0),
177 FP_SCALE(B3 * B1),
178 FP_SCALE(B3 * B2),
179 FP_SCALE(B3 * B3),
180 FP_SCALE(B3 * B4),
181 FP_SCALE(B3 * B5),
182 FP_SCALE(B3 * B6),
183 FP_SCALE(B3 * B7),
184
185 FP_SCALE(B4 * B0),
186 FP_SCALE(B4 * B1),
187 FP_SCALE(B4 * B2),
188 FP_SCALE(B4 * B3),
189 FP_SCALE(B4 * B4),
190 FP_SCALE(B4 * B5),
191 FP_SCALE(B4 * B6),
192 FP_SCALE(B4 * B7),
193
194 FP_SCALE(B5 * B0),
195 FP_SCALE(B5 * B1),
196 FP_SCALE(B5 * B2),
197 FP_SCALE(B5 * B3),
198 FP_SCALE(B5 * B4),
199 FP_SCALE(B5 * B5),
200 FP_SCALE(B5 * B6),
201 FP_SCALE(B5 * B7),
202
203 FP_SCALE(B6 * B0),
204 FP_SCALE(B6 * B1),
205 FP_SCALE(B6 * B2),
206 FP_SCALE(B6 * B3),
207 FP_SCALE(B6 * B4),
208 FP_SCALE(B6 * B5),
209 FP_SCALE(B6 * B6),
210 FP_SCALE(B6 * B7),
211
212 FP_SCALE(B7 * B0),
213 FP_SCALE(B7 * B1),
214 FP_SCALE(B7 * B2),
215 FP_SCALE(B7 * B3),
216 FP_SCALE(B7 * B4),
217 FP_SCALE(B7 * B5),
218 FP_SCALE(B7 * B6),
219 FP_SCALE(B7 * B7),
220 };
221 static const float f_cross_stage[64] = {
222 B0 * B0,
223 B0 * B1,
224 B0 * B2,
225 B0 * B3,
226 B0 * B4,
227 B0 * B5,
228 B0 * B6,
229 B0 * B7,
230
231 B1 * B0,
232 B1 * B1,
233 B1 * B2,
234 B1 * B3,
235 B1 * B4,
236 B1 * B5,
237 B1 * B6,
238 B1 * B7,
239
240 B2 * B0,
241 B2 * B1,
242 B2 * B2,
243 B2 * B3,
244 B2 * B4,
245 B2 * B5,
246 B2 * B6,
247 B2 * B7,
248
249 B3 * B0,
250 B3 * B1,
251 B3 * B2,
252 B3 * B3,
253 B3 * B4,
254 B3 * B5,
255 B3 * B6,
256 B3 * B7,
257
258 B4 * B0,
259 B4 * B1,
260 B4 * B2,
261 B4 * B3,
262 B4 * B4,
263 B4 * B5,
264 B4 * B6,
265 B4 * B7,
266
267 B5 * B0,
268 B5 * B1,
269 B5 * B2,
270 B5 * B3,
271 B5 * B4,
272 B5 * B5,
273 B5 * B6,
274 B5 * B7,
275
276 B6 * B0,
277 B6 * B1,
278 B6 * B2,
279 B6 * B3,
280 B6 * B4,
281 B6 * B5,
282 B6 * B6,
283 B6 * B7,
284
285 B7 * B0,
286 B7 * B1,
287 B7 * B2,
288 B7 * B3,
289 B7 * B4,
290 B7 * B5,
291 B7 * B6,
292 B7 * B7,
293 };
294
295 /*
296 * Map a quantization table in natural, row-order,
297 * into the qt input expected by rdct().
298 */
299 void
300 rdct_fold_q(const int* in, int* out)
301 {
302 #if !MMX_DCT_ENABLED
303 for (int i = 0; i < 64; ++i) {
304 /*
305 * Fold column and row passes of the dct.
306 * By scaling each column DCT independently,
307 * we pre-bias all the row DCT's so the
308 * first multiplier is already embedded
309 * in the temporary result. Thanks to
310 * Martin Vetterli for explaining how
311 * to do this.
312 */
313 double v = double(in[i]);
314 v *= first_stage[i & 7];
315 v *= first_stage[i >> 3];
316 out[i] = FP_SCALE(v);
317 }
318 #else
319 /* the MMX version of rdct() expects the quantization
320 * table to be an array of short */
321 for (int i = 0; i < 64; i++)
322 out[i] = in[i];
323 #endif
324 }
325
326 /*
327 * Just like rdct_fold_q() but we divide by the quantizer.
328 */
329 void
330 fdct_fold_q(const int* in, float* out)
331 {
332 for (int i = 0; i < 64; ++i) {
333 double v = first_stage[i >> 3];
334 v *= first_stage[i & 7];
335 double q = double(in[i]);
336 out[i] = (float) (v / q);
337 }
338 }
339
340 void dcsum(int dc, u_char* in, u_char* out, int stride)
341 {
342 for (int k = 8; --k >= 0; ) {
343 int t;
344 #ifdef INT_64
345 /*FIXME assume little-endian */
346 INT_64 i = *(INT_64*)in;
347 INT_64 o = (INT_64)LIMIT(dc + (i >> 56), t) << 56;
348 o |= (INT_64)LIMIT(dc + (i >> 48 & 0xff), t) << 48;
349 o |= (INT_64)LIMIT(dc + (i >> 40 & 0xff), t) << 40;
350 o |= (INT_64)LIMIT(dc + (i >> 32 & 0xff), t) << 32;
351 o |= (INT_64)LIMIT(dc + (i >> 24 & 0xff), t) << 24;
352 o |= (INT_64)LIMIT(dc + (i >> 16 & 0xff), t) << 16;
353 o |= (INT_64)LIMIT(dc + (i >> 8 & 0xff), t) << 8;
354 o |= (INT_64)LIMIT(dc + (i & 0xff), t);
355 *(INT_64*)out = o;
356 #else
357 u_int o = 0;
358 u_int i = *(u_int*)in;
359 SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
360 SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
361 SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
362 SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
363 *(u_int*)out = o;
364
365 o = 0;
366 i = *(u_int*)(in + 4);
367 SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
368 SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
369 SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
370 SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
371 *(u_int*)(out + 4) = o;
372 #endif
373 in += stride;
374 out += stride;
375 }
376 }
377
378 void dcsum2(int dc, u_char* in, u_char* out, int stride)
379 {
380 for (int k = 8; --k >= 0; ) {
381 int t;
382 u_int o = 0;
383 SPLICE(o, LIMIT(dc + in[0], t), 24);
384 SPLICE(o, LIMIT(dc + in[1], t), 16);
385 SPLICE(o, LIMIT(dc + in[2], t), 8);
386 SPLICE(o, LIMIT(dc + in[3], t), 0);
387 *(u_int*)out = o;
388
389 o = 0;
390 SPLICE(o, LIMIT(dc + in[4], t), 24);
391 SPLICE(o, LIMIT(dc + in[5], t), 16);
392 SPLICE(o, LIMIT(dc + in[6], t), 8);
393 SPLICE(o, LIMIT(dc + in[7], t), 0);
394 *(u_int*)(out + 4) = o;
395
396 in += stride;
397 out += stride;
398 }
399 }
400
401 void dcfill(int DC, u_char* out, int stride)
402 {
403 int t;
404 u_int dc = DC;
405 dc = LIMIT(dc, t);
406 dc |= dc << 8;
407 dc |= dc << 16;
408 #ifdef INT_64
409 INT_64 xdc = dc;
410 xdc |= xdc << 32;
411 *(INT_64 *)out = xdc;
412 out += stride;
413 *(INT_64 *)out = xdc;
414 out += stride;
415 *(INT_64 *)out = xdc;
416 out += stride;
417 *(INT_64 *)out = xdc;
418 out += stride;
419 *(INT_64 *)out = xdc;
420 out += stride;
421 *(INT_64 *)out = xdc;
422 out += stride;
423 *(INT_64 *)out = xdc;
424 out += stride;
425 *(INT_64 *)out = xdc;
426 #else
427 *(u_int*)out = dc;
428 *(u_int*)(out + 4) = dc;
429 out += stride;
430 *(u_int*)out = dc;
431 *(u_int*)(out + 4) = dc;
432 out += stride;
433 *(u_int*)out = dc;
434 *(u_int*)(out + 4) = dc;
435 out += stride;
436 *(u_int*)out = dc;
437 *(u_int*)(out + 4) = dc;
438 out += stride;
439 *(u_int*)out = dc;
440 *(u_int*)(out + 4) = dc;
441 out += stride;
442 *(u_int*)out = dc;
443 *(u_int*)(out + 4) = dc;
444 out += stride;
445 *(u_int*)out = dc;
446 *(u_int*)(out + 4) = dc;
447 out += stride;
448 *(u_int*)out = dc;
449 *(u_int*)(out + 4) = dc;
450 #endif
451 }
452
453 /*
454 * This routine mixes the DC & AC components of an 8x8 block of
455 * pixels. This routine is called for every block decoded so it
456 * needs to be efficient. It tries to do as many pixels in parallel
457 * as will fit in a word. The one complication is that it has to
458 * deal with overflow (sum > 255) and underflow (sum < 0). Underflow
459 * & overflow are only possible if both terms have the same sign and
460 * are indicated by the result having a different sign than the terms.
461 * Note that underflow is more worrisome than overflow since it results
462 * in bright white dots in a black field.
463 * The DC term and sum are biased by 128 so a negative number has the
464 * 2^7 bit = 0. The AC term is not biased so a negative number has
465 * the 2^7 bit = 1. So underflow is indicated by (DC & AC & sum) != 0;
466 */
467 #define MIX_LOGIC(sum, a, b, omask, uflo) \
468 { \
469 sum = a + b; \
470 uflo = (a ^ b) & (a ^ sum) & omask; \
471 if (uflo) { \
472 if ((b = uflo & a) != 0) { \
473 /* integer overflows */ \
474 b |= b >> 1; \
475 b |= b >> 2; \
476 b |= b >> 4; \
477 sum |= b; \
478 } \
479 if ((uflo &=~ b) != 0) { \
480 /* integer underflow(s) */ \
481 uflo |= uflo >> 1; \
482 uflo |= uflo >> 2; \
483 uflo |= uflo >> 4; \
484 sum &= ~uflo; \
485 } \
486 } \
487 }
488 /*
489 * Table of products of 8-bit scaled coefficients
490 * and idct coefficients (there are only 33 unique
491 * coefficients so we index via a compact ID).
492 */
493 extern "C" u_char multab[];
494 /*
495 * Array of coefficient ID's used to index multab.
496 */
497 extern "C" u_int dct_basis[64][64 / sizeof(u_int)];
498
499 /*FIXME*/
500 #define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))
501
502 #ifdef INT_64
503 /*FIXME assume little-endian */
504 #define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
505 #define DID4PIX
506 #define PSTORE ((INT_64*)p)[0] = pix
507 #define PIXDEF INT_64 pix = 0; int v, oflo = 0
508 #else
509 #define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
510 #define DID4PIX pix0 = pix; pix = 0
511 #define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
512 #define PIXDEF u_int pix0, pix = 0; int v, oflo = 0
513 #endif
514 #define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
515 #define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
516 #define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
517 #define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
518 #define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
519 #define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)
520
521 #if MMX_DCT_ENABLED
522 /* this code invokes mmx version of rdct. This routine uses the algorithm
523 * described by C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast
524 * 1-D DCT Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
525 * Speech, and Signal Processing 1989. The algorithm itself is slower but more
526 * accurate than the AAN algorithm used in the standard version of this routine.
527 */
528 char *output_buf[8]; // array of pointers to short
529 char output_space[8][8]; // space that assembly code can write to
530
531 void
532 #ifdef INT_64
533 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
534 #else
535 rdct(register short *bp, u_int m0, u_int m1, u_char* p,
536 int stride, const int* qt)
537 #endif
538 {
539
540 int workspace[68];
541 short bpt[8][8];
542 short qtmx[64];
543 for (int i = 0; i < 64; i++) {
544 qtmx[i] = (short)qt[i];
545 bpt[i & 7][i >> 3] = *(bp + i);
546 }
547 domidct8x8llmW(&bpt[0][0], qtmx, workspace, p, stride);
548
549 }
550
551 #else
552
553 /* The following code is the standard implementation of the DCT */
554
555 /*
556 * A 2D Inverse DCT based on a column-row decomposition using
557 * Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8
558 * Pennebaker & Mitchell (i.e., the pink JPEG book). This figure
559 * is the forward transform; reverse the flowgraph for the inverse
560 * (you need to draw a picture). The outputs are DFT coefficients
561 * and need to be scaled according to Eq [4-3].
562 *
563 * The input coefficients are, counter to tradition, in column-order.
564 * The bit mask indicates which coefficients are non-zero. If the
565 * corresponding bit is zero, then the coefficient is assumed zero
566 * and the input coefficient is not referenced and need not be defined.
567 * The 8-bit outputs are computed in row order and placed in the
568 * output array pointed to by p, with each of the eight 8-byte lines
569 * offset by "stride" bytes.
570 *
571 * qt is the inverse quantization table in column order. These
572 * coefficients are the product of the inverse quantization factor,
573 * specified by the jpeg quantization table, and the first multiplier
574 * in the inverse DCT flow graph. This first multiplier is actually
575 * biased by the row term so that the columns are pre-scaled to
576 * eliminate the first row multiplication stage.
577 *
578 * The output is biased by 128, i.e., [-128,127] is mapped to [0,255],
579 * which is relevant to jpeg.
580 */
581 void
582 #ifdef INT_64
583 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
584 #else
585 rdct(register short *bp, u_int m0, u_int m1, u_char* p,
586 int stride, const int* qt)
587 #endif
588 {
589 /*
590 * First pass is 1D transform over the columns. Note that
591 * the coefficients are stored in column order, so even
592 * though we're accessing the columns, we access them
593 * in a row-like fashion. We use a column-row decomposition
594 * instead of a row-column decomposition so we can splice
595 * pixels in an efficient, row-wise order in the second
596 * stage.
597 *
598 * The inverse quantization is folded together with the
599 * first stage multiplier. This reduces the number of
600 * multiplies (per 8-pt transform) from 11 to 5 (ignoring
601 * the inverse quantization which must be done anyway).
602 *
603 * Because we compute appropriately scaled column DCTs,
604 * the row DCTs require only 5 multiplies per row total.
605 * So the total number of multiplies for the 8x8 DCT is 80
606 * (ignoring the checks for zero coefficients).
607 */
608 int tmp[64];
609 int* tp = tmp;
610 int i;
611 for (i = 8; --i >= 0; ) {
612 if ((m0 & 0xfe) == 0) {
613 /* AC terms all zero */
614 int v = M(0) ? qt[0] * bp[0] : 0;
615 tp[0] = v;
616 tp[1] = v;
617 tp[2] = v;
618 tp[3] = v;
619 tp[4] = v;
620 tp[5] = v;
621 tp[6] = v;
622 tp[7] = v;
623 } else {
624 int t0, t1, t2, t3, t4, t5, t6, t7;
625
626 if ((m0 & 0xaa) == 0)
627 t4 = t5 = t6 = t7 = 0;
628 else {
629 t0 = M(5) ? qt[5] * bp[5] : 0;
630 t2 = M(1) ? qt[1] * bp[1] : 0;
631 t6 = M(7) ? qt[7] * bp[7] : 0;
632 t7 = M(3) ? qt[3] * bp[3] : 0;
633
634 t4 = t0 - t7;
635 t1 = t2 + t6;
636 t6 = t2 - t6;
637 t7 += t0;
638
639 t5 = t1 - t7;
640 t7 += t1;
641
642 t2 = FP_MUL(-A5, t4 + t6);
643 t4 = FP_MUL(-A2, t4);
644 t0 = t4 + t2;
645 t1 = FP_MUL(A3, t5);
646 t2 += FP_MUL(A4, t6);
647
648 t4 = -t0;
649 t5 = t1 - t0;
650 t6 = t1 + t2;
651 t7 += t2;
652 }
653
654 #ifdef notdef
655 if ((m0 & 0x55) == 0)
656 t0 = t1 = t2 = t3 = 0;
657 else {
658 #endif
659 t0 = M(0) ? qt[0] * bp[0] : 0;
660 t1 = M(4) ? qt[4] * bp[4] : 0;
661 int x0 = t0 + t1;
662 int x1 = t0 - t1;
663
664 t0 = M(2) ? qt[2] * bp[2] : 0;
665 t3 = M(6) ? qt[6] * bp[6] : 0;
666 t2 = t0 - t3;
667 t3 += t0;
668
669 t2 = FP_MUL(A1, t2);
670 t3 += t2;
671
672 t0 = x0 + t3;
673 t1 = x1 + t2;
674 t2 = x1 - t2;
675 t3 = x0 - t3;
676 #ifdef notdef
677 }
678 #endif
679
680 tp[0] = t0 + t7;
681 tp[7] = t0 - t7;
682 tp[1] = t1 + t6;
683 tp[6] = t1 - t6;
684 tp[2] = t2 + t5;
685 tp[5] = t2 - t5;
686 tp[3] = t3 + t4;
687 tp[4] = t3 - t4;
688 }
689 tp += 8;
690 bp += 8;
691 qt += 8;
692
693 m0 >>= 8;
694 #ifndef INT_64
695 m0 |= m1 << 24;
696 m1 >>= 8;
697 #endif
698 }
699 tp -= 64;
700 /*
701 * Second pass is 1D transform over the rows. Note that
702 * the coefficients are stored in column order, so even
703 * though we're accessing the rows, we access them
704 * in a column-like fashion.
705 *
706 * The pass above already computed the first multiplier
707 * in the flow graph.
708 */
709 for (i = 8; --i >= 0; ) {
710 int t0, t1, t2, t3, t4, t5, t6, t7;
711
712 t0 = tp[8*5];
713 t2 = tp[8*1];
714 t6 = tp[8*7];
715 t7 = tp[8*3];
716
717 #ifdef notdef
718 if ((t0 | t2 | t6 | t7) == 0) {
719 t4 = t5 = 0;
720 } else
721 #endif
722 {
723 t4 = t0 - t7;
724 t1 = t2 + t6;
725 t6 = t2 - t6;
726 t7 += t0;
727
728 t5 = t1 - t7;
729 t7 += t1;
730
731 t2 = FP_MUL(-A5, t4 + t6);
732 t4 = FP_MUL(-A2, t4);
733 t0 = t4 + t2;
734 t1 = FP_MUL(A3, t5);
735 t2 += FP_MUL(A4, t6);
736
737 t4 = -t0;
738 t5 = t1 - t0;
739 t6 = t1 + t2;
740 t7 += t2;
741 }
742
743 t0 = tp[8*0];
744 t1 = tp[8*4];
745 int x0 = t0 + t1;
746 int x1 = t0 - t1;
747
748 t0 = tp[8*2];
749 t3 = tp[8*6];
750 t2 = t0 - t3;
751 t3 += t0;
752
753 t2 = FP_MUL(A1, t2);
754 t3 += t2;
755
756 t0 = x0 + t3;
757 t1 = x1 + t2;
758 t2 = x1 - t2;
759 t3 = x0 - t3;
760
761 PIXDEF;
762 DOJPIX(t0 + t7, 0);
763 DOJPIX(t1 + t6, 1);
764 DOJPIX(t2 + t5, 2);
765 DOJPIX(t3 + t4, 3);
766 DID4PIX;
767 DOJPIX(t3 - t4, 4);
768 DOJPIX(t2 - t5, 5);
769 DOJPIX(t1 - t6, 6);
770 DOJPIX(t0 - t7, 7);
771 if (oflo & ~0xff) {
772 int t;
773 pix = 0;
774 DOJPIXLIMIT(t0 + t7, 0);
775 DOJPIXLIMIT(t1 + t6, 1);
776 DOJPIXLIMIT(t2 + t5, 2);
777 DOJPIXLIMIT(t3 + t4, 3);
778 DID4PIX;
779 DOJPIXLIMIT(t3 - t4, 4);
780 DOJPIXLIMIT(t2 - t5, 5);
781 DOJPIXLIMIT(t1 - t6, 6);
782 DOJPIXLIMIT(t0 - t7, 7);
783 }
784 PSTORE;
785
786 ++tp;
787 p += stride;
788 }
789
790 }
791 #endif
792
793 /*
794 * Inverse 2-D transform, similar to routine above (see comment above),
795 * but more appropriate for H.261 instead of JPEG. This routine does
796 * not bias the output by 128, and has an additional argument which is
797 * an input array which gets summed together with the inverse-transform.
798 * For example, this allows motion-compensation to be folded in here,
799 * saving an extra traversal of the block. The input pointer can be
800 * null, if motion-compensation is not needed.
801 *
802 * This routine does not take a quantization table, since the H.261
803 * inverse quantizer is easily implemented via table lookup in the decoder.
804 */
805 void
806 #ifdef INT_64
807 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const u_char* in)
808 #else
809 rdct(register short *bp, u_int m0, u_int m1, u_char* p, int stride, const u_char *in)
810 #endif
811 {
812 int tmp[64];
813 int* tp = tmp;
814 const int* qt = cross_stage;
815 /*
816 * First pass is 1D transform over the rows of the input array.
817 */
818 int i;
819 for (i = 8; --i >= 0; ) {
820 if ((m0 & 0xfe) == 0) {
821 /*
822 * All ac terms are zero.
823 */
824 int v = 0;
825 if (M(0))
826 v = qt[0] * bp[0];
827 tp[0] = v;
828 tp[1] = v;
829 tp[2] = v;
830 tp[3] = v;
831 tp[4] = v;
832 tp[5] = v;
833 tp[6] = v;
834 tp[7] = v;
835 } else {
836 int t4 = 0, t5 = 0, t6 = 0, t7 = 0;
837 if (m0 & 0xaa) {
838 /* odd part */
839 if (M(1))
840 t4 = qt[1] * bp[1];
841 if (M(3))
842 t5 = qt[3] * bp[3];
843 if (M(5))
844 t6 = qt[5] * bp[5];
845 if (M(7))
846 t7 = qt[7] * bp[7];
847
848 int x0 = t6 - t5;
849 t6 += t5;
850 int x1 = t4 - t7;
851 t7 += t4;
852
853 t5 = FP_MUL(t7 - t6, A3);
854 t7 += t6;
855
856 t4 = FP_MUL(x1 + x0, A5);
857 t6 = FP_MUL(x1, A4) - t4;
858 t4 += FP_MUL(x0, A2);
859
860 t7 += t6;
861 t6 += t5;
862 t5 += t4;
863 }
864 int t0 = 0, t1 = 0, t2 = 0, t3 = 0;
865 if (m0 & 0x55) {
866 /* even part */
867 if (M(0))
868 t0 = qt[0] * bp[0];
869 if (M(2))
870 t1 = qt[2] * bp[2];
871 if (M(4))
872 t2 = qt[4] * bp[4];
873 if (M(6))
874 t3 = qt[6] * bp[6];
875
876 int x0 = FP_MUL(t1 - t3, A1);
877 t3 += t1;
878 t1 = t0 - t2;
879 t0 += t2;
880 t2 = t3 + x0;
881 t3 = t0 - t2;
882 t0 += t2;
883 t2 = t1 - x0;
884 t1 += x0;
885 }
886 tp[0] = t0 + t7;
887 tp[1] = t1 + t6;
888 tp[2] = t2 + t5;
889 tp[3] = t3 + t4;
890 tp[4] = t3 - t4;
891 tp[5] = t2 - t5;
892 tp[6] = t1 - t6;
893 tp[7] = t0 - t7;
894 }
895 qt += 8;
896 tp += 8;
897 bp += 8;
898 m0 >>= 8;
899 #ifndef INT_64
900 m0 |= m1 << 24;
901 m1 >>= 8;
902 #endif
903 }
904 tp -= 64;
905 /*
906 * Second pass is 1D transform over the rows of the temp array.
907 */
908 for (i = 8; --i >= 0; ) {
909 int t4 = tp[8*1];
910 int t5 = tp[8*3];
911 int t6 = tp[8*5];
912 int t7 = tp[8*7];
913 if ((t4|t5|t6|t7) != 0) {
914 /* odd part */
915 int x0 = t6 - t5;
916 t6 += t5;
917 int x1 = t4 - t7;
918 t7 += t4;
919
920 t5 = FP_MUL(t7 - t6, A3);
921 t7 += t6;
922
923 t4 = FP_MUL(x1 + x0, A5);
924 t6 = FP_MUL(x1, A4) - t4;
925 t4 += FP_MUL(x0, A2);
926
927 t7 += t6;
928 t6 += t5;
929 t5 += t4;
930 }
931 int t0 = tp[8*0];
932 int t1 = tp[8*2];
933 int t2 = tp[8*4];
934 int t3 = tp[8*6];
935 if ((t0|t1|t2|t3) != 0) {
936 /* even part */
937 int x0 = FP_MUL(t1 - t3, A1);
938 t3 += t1;
939 t1 = t0 - t2;
940 t0 += t2;
941 t2 = t3 + x0;
942 t3 = t0 - t2;
943 t0 += t2;
944 t2 = t1 - x0;
945 t1 += x0;
946 }
947 if (in != 0) {
948 PIXDEF;
949 DOPIXIN(t0 + t7, 0);
950 DOPIXIN(t1 + t6, 1);
951 DOPIXIN(t2 + t5, 2);
952 DOPIXIN(t3 + t4, 3);
953 DID4PIX;
954 DOPIXIN(t3 - t4, 4);
955 DOPIXIN(t2 - t5, 5);
956 DOPIXIN(t1 - t6, 6);
957 DOPIXIN(t0 - t7, 7);
958 if (oflo & ~0xff) {
959 int t;
960 pix = 0;
961 DOPIXINLIMIT(t0 + t7, 0);
962 DOPIXINLIMIT(t1 + t6, 1);
963 DOPIXINLIMIT(t2 + t5, 2);
964 DOPIXINLIMIT(t3 + t4, 3);
965 DID4PIX;
966 DOPIXINLIMIT(t3 - t4, 4);
967 DOPIXINLIMIT(t2 - t5, 5);
968 DOPIXINLIMIT(t1 - t6, 6);
969 DOPIXINLIMIT(t0 - t7, 7);
970 }
971 PSTORE;
972 in += stride;
973 } else {
974 PIXDEF;
975 DOPIX(t0 + t7, 0);
976 DOPIX(t1 + t6, 1);
977 DOPIX(t2 + t5, 2);
978 DOPIX(t3 + t4, 3);
979 DID4PIX;
980 DOPIX(t3 - t4, 4);
981 DOPIX(t2 - t5, 5);
982 DOPIX(t1 - t6, 6);
983 DOPIX(t0 - t7, 7);
984 if (oflo & ~0xff) {
985 int t;
986 pix = 0;
987 DOPIXLIMIT(t0 + t7, 0);
988 DOPIXLIMIT(t1 + t6, 1);
989 DOPIXLIMIT(t2 + t5, 2);
990 DOPIXLIMIT(t3 + t4, 3);
991 DID4PIX;
992 DOPIXLIMIT(t3 - t4, 4);
993 DOPIXLIMIT(t2 - t5, 5);
994 DOPIXLIMIT(t1 - t6, 6);
995 DOPIXLIMIT(t0 - t7, 7);
996 }
997 PSTORE;
998 }
999 tp += 1;
1000 p += stride;
1001 }
1002 }
1003
1004 /*
1005 * This macro does the combined descale-and-quantize
1006 * multiply. It truncates rather than rounds to give
1007 * the behavior required for the h.261 deadband quantizer.
1008 */
1009 #define FWD_DandQ(v, iq) short((v) * qt[iq])
1010
1011 void fdct(const u_char* in, int stride, short* out, const float* qt)
1012 {
1013 float tmp[64];
1014 float* tp = tmp;
1015
1016 int i;
1017 for (i = 8; --i >= 0; ) {
1018 float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
1019 t0 = float(in[0] + in[7]);
1020 t7 = float(in[0] - in[7]);
1021 t1 = float(in[1] + in[6]);
1022 t6 = float(in[1] - in[6]);
1023 t2 = float(in[2] + in[5]);
1024 t5 = float(in[2] - in[5]);
1025 t3 = float(in[3] + in[4]);
1026 t4 = float(in[3] - in[4]);
1027
1028 /* even part */
1029 x0 = t0 + t3;
1030 x2 = t1 + t2;
1031 tp[8*0] = x0 + x2;
1032 tp[8*4] = x0 - x2;
1033
1034 x1 = t0 - t3;
1035 x3 = t1 - t2;
1036 t0 = (x1 + x3) * FA1;
1037 tp[8*2] = x1 + t0;
1038 tp[8*6] = x1 - t0;
1039
1040 /* odd part */
1041 x0 = t4 + t5;
1042 x1 = t5 + t6;
1043 x2 = t6 + t7;
1044
1045 t3 = x1 * FA1;
1046 t4 = t7 - t3;
1047
1048 t0 = (x0 - x2) * FA5;
1049 t1 = x0 * FA2 + t0;
1050 tp[8*3] = t4 - t1;
1051 tp[8*5] = t4 + t1;
1052
1053 t7 += t3;
1054 t2 = x2 * FA4 + t0;
1055 tp[8*1] = t7 + t2;
1056 tp[8*7] = t7 - t2;
1057
1058 in += stride;
1059 tp += 1;
1060 }
1061 tp -= 8;
1062
1063 for (i = 8; --i >= 0; ) {
1064 float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
1065 t0 = tp[0] + tp[7];
1066 t7 = tp[0] - tp[7];
1067 t1 = tp[1] + tp[6];
1068 t6 = tp[1] - tp[6];
1069 t2 = tp[2] + tp[5];
1070 t5 = tp[2] - tp[5];
1071 t3 = tp[3] + tp[4];
1072 t4 = tp[3] - tp[4];
1073
1074 /* even part */
1075 x0 = t0 + t3;
1076 x2 = t1 + t2;
1077 out[0] = FWD_DandQ(x0 + x2, 0);
1078 out[4] = FWD_DandQ(x0 - x2, 4);
1079
1080 x1 = t0 - t3;
1081 x3 = t1 - t2;
1082 t0 = (x1 + x3) * FA1;
1083 out[2] = FWD_DandQ(x1 + t0, 2);
1084 out[6] = FWD_DandQ(x1 - t0, 6);
1085
1086 /* odd part */
1087 x0 = t4 + t5;
1088 x1 = t5 + t6;
1089 x2 = t6 + t7;
1090
1091 t3 = x1 * FA1;
1092 t4 = t7 - t3;
1093
1094 t0 = (x0 - x2) * FA5;
1095 t1 = x0 * FA2 + t0;
1096 out[3] = FWD_DandQ(t4 - t1, 3);
1097 out[5] = FWD_DandQ(t4 + t1, 5);
1098
1099 t7 += t3;
1100 t2 = x2 * FA4 + t0;
1101 out[1] = FWD_DandQ(t7 + t2, 1);
1102 out[7] = FWD_DandQ(t7 - t2, 7);
1103
1104 out += 8;
1105 tp += 8;
1106 qt += 8;
1107 }
1108 }
1109
1110 /*
1111 * decimate the *rows* of the two input 8x8 DCT matrices into
1112 * a single output matrix. we decimate rows rather than
1113 * columns even though we want column decimation because
1114 * the DCTs are stored in column order.
1115 */
1116 void
1117 dct_decimate(const short* in0, const short* in1, short* o)
1118 {
1119 for (int k = 0; k < 8; ++k) {
1120 int x00 = in0[0];
1121 int x01 = in0[1];
1122 int x02 = in0[2];
1123 int x03 = in0[3];
1124 int x10 = in1[0];
1125 int x11 = in1[1];
1126 int x12 = in1[2];
1127 int x13 = in1[3];
1128 #define X_N 4
1129 #define X_5(v) ((v) << (X_N - 1))
1130 #define X_25(v) ((v) << (X_N - 2))
1131 #define X_125(v) ((v) << (X_N - 3))
1132 #define X_0625(v) ((v) << (X_N - 4))
1133 #define X_375(v) (X_25(v) + X_125(v))
1134 #define X_625(v) (X_5(v) + X_125(v))
1135 #define X_75(v) (X_5(v) + X_25(v))
1136 #define X_6875(v) (X_5(v) + X_125(v) + X_0625(v))
1137 #define X_1875(v) (X_125(v) + X_0625(v))
1138 #define X_NORM(v) ((v) >> X_N)
1139
1140 /*
1141 * 0.50000000 0.09011998 0.00000000 0.10630376
1142 * 0.50000000 0.09011998 0.00000000 0.10630376
1143 * 0.45306372 0.28832037 0.03732892 0.08667963
1144 * -0.45306372 0.11942621 0.10630376 -0.06764951
1145 * 0.00000000 0.49039264 0.17677670 0.00000000
1146 * 0.00000000 -0.49039264 -0.17677670 0.00000000
1147 * -0.15909482 0.34009707 0.38408888 0.05735049
1148 * 0.15909482 0.43576792 -0.09011998 -0.13845632
1149 * 0.00000000 -0.03732892 0.46193977 0.25663998
1150 * 0.00000000 -0.03732892 0.46193977 0.25663998
1151 * 0.10630376 -0.18235049 0.25663998 0.42361940
1152 * -0.10630376 -0.16332037 -0.45306372 -0.01587282
1153 * 0.00000000 0.00000000 -0.07322330 0.41573481
1154 * 0.00000000 0.00000000 0.07322330 -0.41573481
1155 * -0.09011998 0.13399123 -0.18766514 0.24442621
1156