~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Open Mash Cross Reference
mash/codec/dct.cc

Component: ~ [ mash ] ~ [ apps ] ~ [ gsm ] ~ [ lib ] ~ [ otcl ] ~ [ srm ] ~ [ tcl8.3 ] ~ [ tclcl ] ~ [ tk8.3 ] ~ [ tutorials ] ~

  1 /*
  2  * dct.cc --
  3  *
  4  *      DCT code, also contains MMX version
  5  *
  6  * Copyright (c) 1994-2002 The Regents of the University of California.
  7  * All rights reserved.
  8  *
  9  * Redistribution and use in source and binary forms, with or without
 10  * modification, are permitted provided that the following conditions are met:
 11  *
 12  * A. Redistributions of source code must retain the above copyright notice,
 13  *    this list of conditions and the following disclaimer.
 14  * B. Redistributions in binary form must reproduce the above copyright notice,
 15  *    this list of conditions and the following disclaimer in the documentation
 16  *    and/or other materials provided with the distribution.
 17  * C. Neither the names of the copyright holders nor the names of its
 18  *    contributors may be used to endorse or promote products derived from this
 19  *    software without specific prior written permission.
 20  *
 21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
 22  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
 25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 31  * POSSIBILITY OF SUCH DAMAGE.
 32  */
 33 
 34 #ifndef lint
 35 static const char rcsid[] =
 36     "@(#) $Header: /usr/mash/src/repository/mash/mash-1/codec/dct.cc,v 1.16 2003/11/19 19:20:17 aswan Exp $";
 37 #endif
 38 
 39 #include <sys/types.h>
 40 #include "misc/bsd-endian.h"
 41 #include "codec/dct.h"
 42 #include <stdio.h>
 43 
 44 /* conditional declaration */
 45 #if MMX_DCT_ENABLED
 46 void domidct8x8llmW(short *inptr, short *quantptr, int *wsptr,
 47                            u_char *outptr, int stride);
 48 #endif
 49 
 50 /*
 51  * Macros for fix-point (integer) arithmetic.  FP_NBITS gives the number
 52  * of binary digits past the decimal point.  FP_MUL computes the product
 53  * of two fixed point numbers.  A fixed point number and an integer
 54  * can be directly multiplied to give a fixed point number.  FP_SCALE
 55  * converts a floating point number to fixed point (and is used only
 56  * at startup, not by the dct engine).  FP_NORM converts a fixed
 57  * point number to scalar by rounding to the closest integer.
 58  * FP_JNORM is similar except it folds the jpeg bias of 128 into the
 59  * rounding addition.
 60  */
 61 #define FP_NBITS 15
 62 #define FP_MUL(a, b)    ((((a) >> 5) * ((b) >> 5)) >> (FP_NBITS - 10))
 63 #define FP_SCALE(v)     (int)((double)(v) * double(1 << FP_NBITS) + 0.5)
 64 #define FP_NORM(v)      (((v) + (1 << (FP_NBITS-1))) >> FP_NBITS)
 65 #define FP_JNORM(v)     (((v) + (257 << (FP_NBITS-1))) >> FP_NBITS)
 66 
 67 #define M(n) ((m0 >> (n)) & 1)
 68 
 69 /*
 70  * This macro stolen from nv.
 71  */
 72 /* Sick little macro which will limit x to [0..255] with logical ops */
 73 #define LIMIT8(x, t) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
 74 #define LIMIT(x, t) (LIMIT8((x), t) & 0xff)
 75 
 76 /* row order */
 77 const u_char ROWZAG[] = {
 78         0,  1,  8, 16,  9,  2,  3, 10,
 79         17, 24, 32, 25, 18, 11,  4,  5,
 80         12, 19, 26, 33, 40, 48, 41, 34,
 81         27, 20, 13,  6,  7, 14, 21, 28,
 82         35, 42, 49, 56, 57, 50, 43, 36,
 83         29, 22, 15, 23, 30, 37, 44, 51,
 84         58, 59, 52, 45, 38, 31, 39, 46,
 85         53, 60, 61, 54, 47, 55, 62, 63,
 86         0,  0,  0,  0,  0,  0,  0,  0,
 87         0,  0,  0,  0,  0,  0,  0,  0
 88 };
 89 /* column order */
 90 const u_char COLZAG[] = {
 91         0, 8, 1, 2, 9, 16, 24, 17,
 92         10, 3, 4, 11, 18, 25, 32, 40,
 93         33, 26, 19, 12, 5, 6, 13, 20,
 94         27, 34, 41, 48, 56, 49, 42, 35,
 95         28, 21, 14, 7, 15, 22, 29, 36,
 96         43, 50, 57, 58, 51, 44, 37, 30,
 97         23, 31, 38, 45, 52, 59, 60, 53,
 98         46, 39, 47, 54, 61, 62, 55, 63,
 99         0,  0,  0,  0,  0,  0,  0,  0,
100         0,  0,  0,  0,  0,  0,  0,  0
101 };
102 
103 #define A1 FP_SCALE(0.7071068)
104 #define A2 FP_SCALE(0.5411961)
105 #define A3 A1
106 #define A4 FP_SCALE(1.3065630)
107 #define A5 FP_SCALE(0.3826834)
108 
109 #define FA1 (0.707106781f)
110 #define FA2 (0.541196100f)
111 #define FA3 FA1
112 #define FA4 (1.306562965f)
113 #define FA5 (0.382683433f)
114 
115 /*
116  * these magic numbers are scaling factors for each coef of the 1-d
117  * AA&N DCT.  The scale factor for coef 0 is 1 and coef 1<=n<=7 is
118  * cos(n*PI/16)*sqrt(2).  There is also a normalization of sqrt(8).
119  * Formally you divide by the scale factor but we multiply by the
120  * inverse because it's faster.  So the numbers below are the inverse
121  * of what was just described.
122  */
123 #define B0 0.35355339059327376220
124 #define B1 0.25489778955207958447
125 #define B2 0.27059805007309849220
126 #define B3 0.30067244346752264027
127 #define B4 0.35355339059327376220
128 #define B5 0.44998811156820785231
129 #define B6 0.65328148243818826392
130 #define B7 1.28145772387075308943
131 
132 /*
133  * Output multipliers for AA&N DCT
134  * (i.e., first stage multipliers for inverse DCT).
135  */
136 static const double first_stage[8] = { B0, B1, B2, B3, B4, B5, B6, B7, };
137 
138 #ifdef _MSC_VER
139 // 'initializing' : truncation from 'const double to float
140 #pragma warning(disable: 4305)
141 #endif
142 
143 /*
144  * The first_stage array crossed with itself.  This allows us
145  * to embed the first stage multipliers of the row pass by
146  * computing scaled versions of the columns.
147  */
148 static const int cross_stage[64] = {
149         FP_SCALE(B0 * B0),
150         FP_SCALE(B0 * B1),
151         FP_SCALE(B0 * B2),
152         FP_SCALE(B0 * B3),
153         FP_SCALE(B0 * B4),
154         FP_SCALE(B0 * B5),
155         FP_SCALE(B0 * B6),
156         FP_SCALE(B0 * B7),
157 
158         FP_SCALE(B1 * B0),
159         FP_SCALE(B1 * B1),
160         FP_SCALE(B1 * B2),
161         FP_SCALE(B1 * B3),
162         FP_SCALE(B1 * B4),
163         FP_SCALE(B1 * B5),
164         FP_SCALE(B1 * B6),
165         FP_SCALE(B1 * B7),
166 
167         FP_SCALE(B2 * B0),
168         FP_SCALE(B2 * B1),
169         FP_SCALE(B2 * B2),
170         FP_SCALE(B2 * B3),
171         FP_SCALE(B2 * B4),
172         FP_SCALE(B2 * B5),
173         FP_SCALE(B2 * B6),
174         FP_SCALE(B2 * B7),
175 
176         FP_SCALE(B3 * B0),
177         FP_SCALE(B3 * B1),
178         FP_SCALE(B3 * B2),
179         FP_SCALE(B3 * B3),
180         FP_SCALE(B3 * B4),
181         FP_SCALE(B3 * B5),
182         FP_SCALE(B3 * B6),
183         FP_SCALE(B3 * B7),
184 
185         FP_SCALE(B4 * B0),
186         FP_SCALE(B4 * B1),
187         FP_SCALE(B4 * B2),
188         FP_SCALE(B4 * B3),
189         FP_SCALE(B4 * B4),
190         FP_SCALE(B4 * B5),
191         FP_SCALE(B4 * B6),
192         FP_SCALE(B4 * B7),
193 
194         FP_SCALE(B5 * B0),
195         FP_SCALE(B5 * B1),
196         FP_SCALE(B5 * B2),
197         FP_SCALE(B5 * B3),
198         FP_SCALE(B5 * B4),
199         FP_SCALE(B5 * B5),
200         FP_SCALE(B5 * B6),
201         FP_SCALE(B5 * B7),
202 
203         FP_SCALE(B6 * B0),
204         FP_SCALE(B6 * B1),
205         FP_SCALE(B6 * B2),
206         FP_SCALE(B6 * B3),
207         FP_SCALE(B6 * B4),
208         FP_SCALE(B6 * B5),
209         FP_SCALE(B6 * B6),
210         FP_SCALE(B6 * B7),
211 
212         FP_SCALE(B7 * B0),
213         FP_SCALE(B7 * B1),
214         FP_SCALE(B7 * B2),
215         FP_SCALE(B7 * B3),
216         FP_SCALE(B7 * B4),
217         FP_SCALE(B7 * B5),
218         FP_SCALE(B7 * B6),
219         FP_SCALE(B7 * B7),
220 };
221 static const float f_cross_stage[64] = {
222     B0 * B0,
223     B0 * B1,
224     B0 * B2,
225     B0 * B3,
226     B0 * B4,
227     B0 * B5,
228     B0 * B6,
229     B0 * B7,
230 
231     B1 * B0,
232     B1 * B1,
233     B1 * B2,
234     B1 * B3,
235     B1 * B4,
236     B1 * B5,
237     B1 * B6,
238     B1 * B7,
239 
240     B2 * B0,
241     B2 * B1,
242     B2 * B2,
243     B2 * B3,
244     B2 * B4,
245     B2 * B5,
246     B2 * B6,
247     B2 * B7,
248 
249     B3 * B0,
250     B3 * B1,
251     B3 * B2,
252     B3 * B3,
253     B3 * B4,
254     B3 * B5,
255     B3 * B6,
256     B3 * B7,
257 
258     B4 * B0,
259     B4 * B1,
260     B4 * B2,
261     B4 * B3,
262     B4 * B4,
263     B4 * B5,
264     B4 * B6,
265     B4 * B7,
266 
267     B5 * B0,
268     B5 * B1,
269     B5 * B2,
270     B5 * B3,
271     B5 * B4,
272     B5 * B5,
273     B5 * B6,
274     B5 * B7,
275 
276     B6 * B0,
277     B6 * B1,
278     B6 * B2,
279     B6 * B3,
280     B6 * B4,
281     B6 * B5,
282     B6 * B6,
283     B6 * B7,
284 
285     B7 * B0,
286     B7 * B1,
287     B7 * B2,
288     B7 * B3,
289     B7 * B4,
290     B7 * B5,
291     B7 * B6,
292     B7 * B7,
293 };
294 
295 /*
296  * Map a quantization table in natural, row-order,
297  * into the qt input expected by rdct().
298  */
299 void
300 rdct_fold_q(const int* in, int* out)
301 {
302 #if !MMX_DCT_ENABLED
303         for (int i = 0; i < 64; ++i) {
304                 /*
305                  * Fold column and row passes of the dct.
306                  * By scaling each column DCT independently,
307                  * we pre-bias all the row DCT's so the
308                  * first multiplier is already embedded
309                  * in the temporary result.  Thanks to
310                  * Martin Vetterli for explaining how
311                  * to do this.
312                  */
313                 double v = double(in[i]);
314                 v *= first_stage[i & 7];
315                 v *= first_stage[i >> 3];
316                 out[i] = FP_SCALE(v);
317         }
318 #else
319         /* the MMX version of rdct() expects the quantization
320          * table to be an array of short */
321         for (int i = 0; i < 64; i++)
322           out[i] = in[i];
323 #endif
324 }
325 
326 /*
327  * Just like rdct_fold_q() but we divide by the quantizer.
328  */
329 void
330 fdct_fold_q(const int* in, float* out)
331 {
332         for (int i = 0; i < 64; ++i) {
333                 double v = first_stage[i >> 3];
334                 v *= first_stage[i & 7];
335                 double q = double(in[i]);
336                 out[i] = (float) (v / q);
337         }
338 }
339 
340 void dcsum(int dc, u_char* in, u_char* out, int stride)
341 {
342         for (int k = 8; --k >= 0; ) {
343                 int t;
344 #ifdef INT_64
345                 /*FIXME assume little-endian */
346                 INT_64 i = *(INT_64*)in;
347                 INT_64 o = (INT_64)LIMIT(dc + (i >> 56), t) << 56;
348                 o |=  (INT_64)LIMIT(dc + (i >> 48 & 0xff), t) << 48;
349                 o |=  (INT_64)LIMIT(dc + (i >> 40 & 0xff), t) << 40;
350                 o |=  (INT_64)LIMIT(dc + (i >> 32 & 0xff), t) << 32;
351                 o |=  (INT_64)LIMIT(dc + (i >> 24 & 0xff), t) << 24;
352                 o |=  (INT_64)LIMIT(dc + (i >> 16 & 0xff), t) << 16;
353                 o |=  (INT_64)LIMIT(dc + (i >> 8 & 0xff), t) << 8;
354                 o |=  (INT_64)LIMIT(dc + (i & 0xff), t);
355                 *(INT_64*)out = o;
356 #else
357                 u_int o = 0;
358                 u_int i = *(u_int*)in;
359                 SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
360                 SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
361                 SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
362                 SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
363                 *(u_int*)out = o;
364 
365                 o = 0;
366                 i = *(u_int*)(in + 4);
367                 SPLICE(o, LIMIT(dc + EXTRACT(i, 24),  t), 24);
368                 SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
369                 SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
370                 SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
371                 *(u_int*)(out + 4) = o;
372 #endif
373                 in += stride;
374                 out += stride;
375         }
376 }
377 
378 void dcsum2(int dc, u_char* in, u_char* out, int stride)
379 {
380         for (int k = 8; --k >= 0; ) {
381                 int t;
382                 u_int o = 0;
383                 SPLICE(o, LIMIT(dc + in[0], t), 24);
384                 SPLICE(o, LIMIT(dc + in[1], t), 16);
385                 SPLICE(o, LIMIT(dc + in[2], t), 8);
386                 SPLICE(o, LIMIT(dc + in[3], t), 0);
387                 *(u_int*)out = o;
388 
389                 o = 0;
390                 SPLICE(o, LIMIT(dc + in[4], t), 24);
391                 SPLICE(o, LIMIT(dc + in[5], t), 16);
392                 SPLICE(o, LIMIT(dc + in[6], t), 8);
393                 SPLICE(o, LIMIT(dc + in[7], t), 0);
394                 *(u_int*)(out + 4) = o;
395 
396                 in += stride;
397                 out += stride;
398         }
399 }
400 
401 void dcfill(int DC, u_char* out, int stride)
402 {
403         int t;
404         u_int dc = DC;
405         dc = LIMIT(dc, t);
406         dc |= dc << 8;
407         dc |= dc << 16;
408 #ifdef INT_64
409         INT_64 xdc = dc;
410         xdc |= xdc << 32;
411         *(INT_64 *)out = xdc;
412         out += stride;
413         *(INT_64 *)out = xdc;
414         out += stride;
415         *(INT_64 *)out = xdc;
416         out += stride;
417         *(INT_64 *)out = xdc;
418         out += stride;
419         *(INT_64 *)out = xdc;
420         out += stride;
421         *(INT_64 *)out = xdc;
422         out += stride;
423         *(INT_64 *)out = xdc;
424         out += stride;
425         *(INT_64 *)out = xdc;
426 #else
427         *(u_int*)out = dc;
428         *(u_int*)(out + 4) = dc;
429         out += stride;
430         *(u_int*)out = dc;
431         *(u_int*)(out + 4) = dc;
432         out += stride;
433         *(u_int*)out = dc;
434         *(u_int*)(out + 4) = dc;
435         out += stride;
436         *(u_int*)out = dc;
437         *(u_int*)(out + 4) = dc;
438         out += stride;
439         *(u_int*)out = dc;
440         *(u_int*)(out + 4) = dc;
441         out += stride;
442         *(u_int*)out = dc;
443         *(u_int*)(out + 4) = dc;
444         out += stride;
445         *(u_int*)out = dc;
446         *(u_int*)(out + 4) = dc;
447         out += stride;
448         *(u_int*)out = dc;
449         *(u_int*)(out + 4) = dc;
450 #endif
451 }
452 
453 /*
454  * This routine mixes the DC & AC components of an 8x8 block of
455  * pixels.  This routine is called for every block decoded so it
456  * needs to be efficient.  It tries to do as many pixels in parallel
457  * as will fit in a word.  The one complication is that it has to
458  * deal with overflow (sum > 255) and underflow (sum < 0).  Underflow
459  * & overflow are only possible if both terms have the same sign and
460  * are indicated by the result having a different sign than the terms.
461  * Note that underflow is more worrisome than overflow since it results
462  * in bright white dots in a black field.
463  * The DC term and sum are biased by 128 so a negative number has the
464  * 2^7 bit = 0.  The AC term is not biased so a negative number has
465  * the 2^7 bit = 1.  So underflow is indicated by (DC & AC & sum) != 0;
466  */
467 #define MIX_LOGIC(sum, a, b, omask, uflo) \
468 { \
469         sum = a + b; \
470         uflo = (a ^ b) & (a ^ sum) & omask; \
471         if (uflo) { \
472                 if ((b = uflo & a) != 0) { \
473                         /* integer overflows */ \
474                         b |= b >> 1; \
475                         b |= b >> 2; \
476                         b |= b >> 4; \
477                         sum |= b; \
478                 } \
479                 if ((uflo &=~ b) != 0) { \
480                         /* integer underflow(s) */ \
481                         uflo |= uflo >> 1; \
482                         uflo |= uflo >> 2; \
483                         uflo |= uflo >> 4; \
484                         sum &= ~uflo; \
485                 } \
486         } \
487 }
488 /*
489  * Table of products of 8-bit scaled coefficients
490  * and idct coefficients (there are only 33 unique
491  * coefficients so we index via a compact ID).
492  */
493 extern "C" u_char multab[];
494 /*
495  * Array of coefficient ID's used to index multab.
496  */
497 extern "C" u_int dct_basis[64][64 / sizeof(u_int)];
498 
499 /*FIXME*/
500 #define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))
501 
502 #ifdef INT_64
503 /*FIXME assume little-endian */
504 #define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
505 #define DID4PIX
506 #define PSTORE ((INT_64*)p)[0] = pix
507 #define PIXDEF INT_64 pix = 0; int v, oflo = 0
508 #else
509 #define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
510 #define DID4PIX pix0 = pix; pix = 0
511 #define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
512 #define PIXDEF  u_int pix0, pix = 0; int v, oflo = 0
513 #endif
514 #define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
515 #define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
516 #define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
517 #define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
518 #define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
519 #define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)
520 
521 #if MMX_DCT_ENABLED
522 /* this code invokes mmx version of rdct.  This routine uses the algorithm
523  * described by C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast
524  * 1-D DCT Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
525  * Speech, and Signal Processing 1989.  The algorithm itself is slower but more
526  * accurate than the AAN algorithm used in the standard version of this routine.
527  */
528 char *output_buf[8];       // array of pointers to short
529 char output_space[8][8];   // space that assembly code can write to
530 
531 void
532 #ifdef INT_64
533 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
534 #else
535 rdct(register short *bp, u_int m0, u_int m1, u_char* p,
536      int stride, const int* qt)
537 #endif
538 {
539 
540   int workspace[68];
541   short bpt[8][8];
542   short qtmx[64];
543   for (int i = 0; i < 64; i++) {
544     qtmx[i] = (short)qt[i];
545     bpt[i & 7][i >> 3] = *(bp + i);
546   }
547   domidct8x8llmW(&bpt[0][0], qtmx, workspace, p, stride);
548 
549 }
550 
551 #else
552 
553 /* The following code is the standard implementation of the DCT */
554 
555 /*
556  * A 2D Inverse DCT based on a column-row decomposition using
557  * Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8
558  * Pennebaker & Mitchell (i.e., the pink JPEG book).  This figure
559  * is the forward transform; reverse the flowgraph for the inverse
560  * (you need to draw a picture).  The outputs are DFT coefficients
561  * and need to be scaled according to Eq [4-3].
562  *
563  * The input coefficients are, counter to tradition, in column-order.
564  * The bit mask indicates which coefficients are non-zero.  If the
565  * corresponding bit is zero, then the coefficient is assumed zero
566  * and the input coefficient is not referenced and need not be defined.
567  * The 8-bit outputs are computed in row order and placed in the
568  * output array pointed to by p, with each of the eight 8-byte lines
569  * offset by "stride" bytes.
570  *
571  * qt is the inverse quantization table in column order.  These
572  * coefficients are the product of the inverse quantization factor,
573  * specified by the jpeg quantization table, and the first multiplier
574  * in the inverse DCT flow graph.  This first multiplier is actually
575  * biased by the row term so that the columns are pre-scaled to
576  * eliminate the first row multiplication stage.
577  *
578  * The output is biased by 128, i.e., [-128,127] is mapped to [0,255],
579  * which is relevant to jpeg.
580  */
581 void
582 #ifdef INT_64
583 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
584 #else
585 rdct(register short *bp, u_int m0, u_int m1, u_char* p,
586      int stride, const int* qt)
587 #endif
588 {
589         /*
590          * First pass is 1D transform over the columns.  Note that
591          * the coefficients are stored in column order, so even
592          * though we're accessing the columns, we access them
593          * in a row-like fashion.  We use a column-row decomposition
594          * instead of a row-column decomposition so we can splice
595          * pixels in an efficient, row-wise order in the second
596          * stage.
597          *
598          * The inverse quantization is folded together with the
599          * first stage multiplier.  This reduces the number of
600          * multiplies (per 8-pt transform) from 11 to 5 (ignoring
601          * the inverse quantization which must be done anyway).
602          *
603          * Because we compute appropriately scaled column DCTs,
604          * the row DCTs require only 5 multiplies per row total.
605          * So the total number of multiplies for the 8x8 DCT is 80
606          * (ignoring the checks for zero coefficients).
607          */
608         int tmp[64];
609         int* tp = tmp;
610         int i;
611         for (i = 8; --i >= 0; ) {
612                 if ((m0 & 0xfe) == 0) {
613                         /* AC terms all zero */
614                         int v = M(0) ? qt[0] * bp[0] : 0;
615                         tp[0] = v;
616                         tp[1] = v;
617                         tp[2] = v;
618                         tp[3] = v;
619                         tp[4] = v;
620                         tp[5] = v;
621                         tp[6] = v;
622                         tp[7] = v;
623                 } else {
624                         int t0, t1, t2, t3, t4, t5, t6, t7;
625 
626                         if ((m0 & 0xaa) == 0)
627                                 t4 = t5 = t6 = t7 = 0;
628                         else {
629                                 t0 = M(5) ? qt[5] * bp[5] : 0;
630                                 t2 = M(1) ? qt[1] * bp[1] : 0;
631                                 t6 = M(7) ? qt[7] * bp[7] : 0;
632                                 t7 = M(3) ? qt[3] * bp[3] : 0;
633 
634                                 t4 = t0 - t7;
635                                 t1 = t2 + t6;
636                                 t6 = t2 - t6;
637                                 t7 += t0;
638 
639                                 t5 = t1 - t7;
640                                 t7 += t1;
641 
642                                 t2 = FP_MUL(-A5, t4 + t6);
643                                 t4 = FP_MUL(-A2, t4);
644                                 t0 = t4 + t2;
645                                 t1 = FP_MUL(A3, t5);
646                                 t2 += FP_MUL(A4, t6);
647 
648                                 t4 = -t0;
649                                 t5 = t1 - t0;
650                                 t6 = t1 + t2;
651                                 t7 += t2;
652                         }
653 
654 #ifdef notdef
655                         if ((m0 & 0x55) == 0)
656                                 t0 = t1 = t2 = t3 = 0;
657                         else {
658 #endif
659                                 t0 = M(0) ? qt[0] * bp[0] : 0;
660                                 t1 = M(4) ? qt[4] * bp[4] : 0;
661                                 int x0 = t0 + t1;
662                                 int x1 = t0 - t1;
663 
664                                 t0 = M(2) ? qt[2] * bp[2] : 0;
665                                 t3 = M(6) ? qt[6] * bp[6] : 0;
666                                 t2 = t0 - t3;
667                                 t3 += t0;
668 
669                                 t2 = FP_MUL(A1, t2);
670                                 t3 += t2;
671 
672                                 t0 = x0 + t3;
673                                 t1 = x1 + t2;
674                                 t2 = x1 - t2;
675                                 t3 = x0 - t3;
676 #ifdef notdef
677                         }
678 #endif
679 
680                         tp[0] = t0 + t7;
681                         tp[7] = t0 - t7;
682                         tp[1] = t1 + t6;
683                         tp[6] = t1 - t6;
684                         tp[2] = t2 + t5;
685                         tp[5] = t2 - t5;
686                         tp[3] = t3 + t4;
687                         tp[4] = t3 - t4;
688                 }
689                 tp += 8;
690                 bp += 8;
691                 qt += 8;
692 
693                 m0 >>= 8;
694 #ifndef INT_64
695                 m0 |= m1 << 24;
696                 m1 >>= 8;
697 #endif
698         }
699         tp -= 64;
700         /*
701          * Second pass is 1D transform over the rows.  Note that
702          * the coefficients are stored in column order, so even
703          * though we're accessing the rows, we access them
704          * in a column-like fashion.
705          *
706          * The pass above already computed the first multiplier
707          * in the flow graph.
708          */
709         for (i = 8; --i >= 0; ) {
710                 int t0, t1, t2, t3, t4, t5, t6, t7;
711 
712                 t0 = tp[8*5];
713                 t2 = tp[8*1];
714                 t6 = tp[8*7];
715                 t7 = tp[8*3];
716 
717 #ifdef notdef
718                 if ((t0 | t2 | t6 | t7) == 0) {
719                         t4 = t5 = 0;
720                 } else
721 #endif
722 {
723                         t4 = t0 - t7;
724                         t1 = t2 + t6;
725                         t6 = t2 - t6;
726                         t7 += t0;
727 
728                         t5 = t1 - t7;
729                         t7 += t1;
730 
731                         t2 = FP_MUL(-A5, t4 + t6);
732                         t4 = FP_MUL(-A2, t4);
733                         t0 = t4 + t2;
734                         t1 = FP_MUL(A3, t5);
735                         t2 += FP_MUL(A4, t6);
736 
737                         t4 = -t0;
738                         t5 = t1 - t0;
739                         t6 = t1 + t2;
740                         t7 += t2;
741                 }
742 
743                 t0 = tp[8*0];
744                 t1 = tp[8*4];
745                 int x0 = t0 + t1;
746                 int x1 = t0 - t1;
747 
748                 t0 = tp[8*2];
749                 t3 = tp[8*6];
750                 t2 = t0 - t3;
751                 t3 += t0;
752 
753                 t2 = FP_MUL(A1, t2);
754                 t3 += t2;
755 
756                 t0 = x0 + t3;
757                 t1 = x1 + t2;
758                 t2 = x1 - t2;
759                 t3 = x0 - t3;
760 
761                 PIXDEF;
762                 DOJPIX(t0 + t7, 0);
763                 DOJPIX(t1 + t6, 1);
764                 DOJPIX(t2 + t5, 2);
765                 DOJPIX(t3 + t4, 3);
766                 DID4PIX;
767                 DOJPIX(t3 - t4, 4);
768                 DOJPIX(t2 - t5, 5);
769                 DOJPIX(t1 - t6, 6);
770                 DOJPIX(t0 - t7, 7);
771                 if (oflo & ~0xff) {
772                         int t;
773                         pix = 0;
774                         DOJPIXLIMIT(t0 + t7, 0);
775                         DOJPIXLIMIT(t1 + t6, 1);
776                         DOJPIXLIMIT(t2 + t5, 2);
777                         DOJPIXLIMIT(t3 + t4, 3);
778                         DID4PIX;
779                         DOJPIXLIMIT(t3 - t4, 4);
780                         DOJPIXLIMIT(t2 - t5, 5);
781                         DOJPIXLIMIT(t1 - t6, 6);
782                         DOJPIXLIMIT(t0 - t7, 7);
783                 }
784                 PSTORE;
785 
786                 ++tp;
787                 p += stride;
788         }
789 
790 }
791 #endif
792 
793 /*
794  * Inverse 2-D transform, similar to routine above (see comment above),
795  * but more appropriate for H.261 instead of JPEG.  This routine does
796  * not bias the output by 128, and has an additional argument which is
797  * an input array which gets summed together with the inverse-transform.
798  * For example, this allows motion-compensation to be folded in here,
799  * saving an extra traversal of the block.  The input pointer can be
800  * null, if motion-compensation is not needed.
801  *
802  * This routine does not take a quantization table, since the H.261
803  * inverse quantizer is easily implemented via table lookup in the decoder.
804  */
805 void
806 #ifdef INT_64
807 rdct(register short *bp, INT_64 m0, u_char* p, int stride, const u_char* in)
808 #else
809 rdct(register short *bp, u_int m0, u_int m1, u_char* p, int stride, const u_char *in)
810 #endif
811 {
812         int tmp[64];
813         int* tp = tmp;
814         const int* qt = cross_stage;
815         /*
816          * First pass is 1D transform over the rows of the input array.
817          */
818         int i;
819         for (i = 8; --i >= 0; ) {
820                 if ((m0 & 0xfe) == 0) {
821                         /*
822                          * All ac terms are zero.
823                          */
824                         int v = 0;
825                         if (M(0))
826                                 v = qt[0] * bp[0];
827                         tp[0] = v;
828                         tp[1] = v;
829                         tp[2] = v;
830                         tp[3] = v;
831                         tp[4] = v;
832                         tp[5] = v;
833                         tp[6] = v;
834                         tp[7] = v;
835                 } else {
836                         int t4 = 0, t5 = 0, t6 = 0, t7 = 0;
837                         if (m0 & 0xaa) {
838                                 /* odd part */
839                                 if (M(1))
840                                         t4 = qt[1] * bp[1];
841                                 if (M(3))
842                                         t5 = qt[3] * bp[3];
843                                 if (M(5))
844                                         t6 = qt[5] * bp[5];
845                                 if (M(7))
846                                         t7 = qt[7] * bp[7];
847 
848                                 int x0 = t6 - t5;
849                                 t6 += t5;
850                                 int x1 = t4 - t7;
851                                 t7 += t4;
852 
853                                 t5 = FP_MUL(t7 - t6, A3);
854                                 t7 += t6;
855 
856                                 t4 = FP_MUL(x1 + x0, A5);
857                                 t6 = FP_MUL(x1, A4) - t4;
858                                 t4 += FP_MUL(x0, A2);
859 
860                                 t7 += t6;
861                                 t6 += t5;
862                                 t5 += t4;
863                         }
864                         int t0 = 0, t1 = 0, t2 = 0, t3 = 0;
865                         if (m0 & 0x55) {
866                                 /* even part */
867                                 if (M(0))
868                                         t0 = qt[0] * bp[0];
869                                 if (M(2))
870                                         t1 = qt[2] * bp[2];
871                                 if (M(4))
872                                         t2 = qt[4] * bp[4];
873                                 if (M(6))
874                                         t3 = qt[6] * bp[6];
875 
876                                 int x0 = FP_MUL(t1 - t3, A1);
877                                 t3 += t1;
878                                 t1 = t0 - t2;
879                                 t0 += t2;
880                                 t2 = t3 + x0;
881                                 t3 = t0 - t2;
882                                 t0 += t2;
883                                 t2 = t1 - x0;
884                                 t1 += x0;
885                         }
886                         tp[0] = t0 + t7;
887                         tp[1] = t1 + t6;
888                         tp[2] = t2 + t5;
889                         tp[3] = t3 + t4;
890                         tp[4] = t3 - t4;
891                         tp[5] = t2 - t5;
892                         tp[6] = t1 - t6;
893                         tp[7] = t0 - t7;
894                 }
895                 qt += 8;
896                 tp += 8;
897                 bp += 8;
898                 m0 >>= 8;
899 #ifndef INT_64
900                 m0 |= m1 << 24;
901                 m1 >>= 8;
902 #endif
903         }
904         tp -= 64;
905         /*
906          * Second pass is 1D transform over the rows of the temp array.
907          */
908         for (i = 8; --i >= 0; ) {
909                 int t4 = tp[8*1];
910                 int t5 = tp[8*3];
911                 int t6 = tp[8*5];
912                 int t7 = tp[8*7];
913                 if ((t4|t5|t6|t7) != 0) {
914                         /* odd part */
915                         int x0 = t6 - t5;
916                         t6 += t5;
917                         int x1 = t4 - t7;
918                         t7 += t4;
919 
920                         t5 = FP_MUL(t7 - t6, A3);
921                         t7 += t6;
922 
923                         t4 = FP_MUL(x1 + x0, A5);
924                         t6 = FP_MUL(x1, A4) - t4;
925                         t4 += FP_MUL(x0, A2);
926 
927                         t7 += t6;
928                         t6 += t5;
929                         t5 += t4;
930                 }
931                 int t0 = tp[8*0];
932                 int t1 = tp[8*2];
933                 int t2 = tp[8*4];
934                 int t3 = tp[8*6];
935                 if ((t0|t1|t2|t3) != 0) {
936                         /* even part */
937                         int x0 = FP_MUL(t1 - t3, A1);
938                         t3 += t1;
939                         t1 = t0 - t2;
940                         t0 += t2;
941                         t2 = t3 + x0;
942                         t3 = t0 - t2;
943                         t0 += t2;
944                         t2 = t1 - x0;
945                         t1 += x0;
946                 }
947                 if (in != 0) {
948                         PIXDEF;
949                         DOPIXIN(t0 + t7, 0);
950                         DOPIXIN(t1 + t6, 1);
951                         DOPIXIN(t2 + t5, 2);
952                         DOPIXIN(t3 + t4, 3);
953                         DID4PIX;
954                         DOPIXIN(t3 - t4, 4);
955                         DOPIXIN(t2 - t5, 5);
956                         DOPIXIN(t1 - t6, 6);
957                         DOPIXIN(t0 - t7, 7);
958                         if (oflo & ~0xff) {
959                                 int t;
960                                 pix = 0;
961                                 DOPIXINLIMIT(t0 + t7, 0);
962                                 DOPIXINLIMIT(t1 + t6, 1);
963                                 DOPIXINLIMIT(t2 + t5, 2);
964                                 DOPIXINLIMIT(t3 + t4, 3);
965                                 DID4PIX;
966                                 DOPIXINLIMIT(t3 - t4, 4);
967                                 DOPIXINLIMIT(t2 - t5, 5);
968                                 DOPIXINLIMIT(t1 - t6, 6);
969                                 DOPIXINLIMIT(t0 - t7, 7);
970                         }
971                         PSTORE;
972                         in += stride;
973                 } else {
974                         PIXDEF;
975                         DOPIX(t0 + t7, 0);
976                         DOPIX(t1 + t6, 1);
977                         DOPIX(t2 + t5, 2);
978                         DOPIX(t3 + t4, 3);
979                         DID4PIX;
980                         DOPIX(t3 - t4, 4);
981                         DOPIX(t2 - t5, 5);
982                         DOPIX(t1 - t6, 6);
983                         DOPIX(t0 - t7, 7);
984                         if (oflo & ~0xff) {
985                                 int t;
986                                 pix = 0;
987                                 DOPIXLIMIT(t0 + t7, 0);
988                                 DOPIXLIMIT(t1 + t6, 1);
989                                 DOPIXLIMIT(t2 + t5, 2);
990                                 DOPIXLIMIT(t3 + t4, 3);
991                                 DID4PIX;
992                                 DOPIXLIMIT(t3 - t4, 4);
993                                 DOPIXLIMIT(t2 - t5, 5);
994                                 DOPIXLIMIT(t1 - t6, 6);
995                                 DOPIXLIMIT(t0 - t7, 7);
996                         }
997                         PSTORE;
998                 }
999                 tp += 1;
1000                 p += stride;
1001         }
1002 }
1003 
1004 /*
1005  * This macro does the combined descale-and-quantize
1006  * multiply.  It truncates rather than rounds to give
1007  * the behavior required for the h.261 deadband quantizer.
1008  */
1009 #define FWD_DandQ(v, iq) short((v) * qt[iq])
1010 
1011 void fdct(const u_char* in, int stride, short* out, const float* qt)
1012 {
1013         float tmp[64];
1014         float* tp = tmp;
1015 
1016         int i;
1017         for (i = 8; --i >= 0; ) {
1018                 float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
1019                 t0 = float(in[0] + in[7]);
1020                 t7 = float(in[0] - in[7]);
1021                 t1 = float(in[1] + in[6]);
1022                 t6 = float(in[1] - in[6]);
1023                 t2 = float(in[2] + in[5]);
1024                 t5 = float(in[2] - in[5]);
1025                 t3 = float(in[3] + in[4]);
1026                 t4 = float(in[3] - in[4]);
1027 
1028                 /* even part */
1029                 x0 = t0 + t3;
1030                 x2 = t1 + t2;
1031                 tp[8*0] = x0 + x2;
1032                 tp[8*4] = x0 - x2;
1033 
1034                 x1 = t0 - t3;
1035                 x3 = t1 - t2;
1036                 t0 = (x1 + x3) * FA1;
1037                 tp[8*2] = x1 + t0;
1038                 tp[8*6] = x1 - t0;
1039 
1040                 /* odd part */
1041                 x0 = t4 + t5;
1042                 x1 = t5 + t6;
1043                 x2 = t6 + t7;
1044 
1045                 t3 = x1 * FA1;
1046                 t4 = t7 - t3;
1047 
1048                 t0 = (x0 - x2) * FA5;
1049                 t1 = x0 * FA2 + t0;
1050                 tp[8*3] = t4 - t1;
1051                 tp[8*5] = t4 + t1;
1052 
1053                 t7 += t3;
1054                 t2 = x2 * FA4 + t0;
1055                 tp[8*1] = t7 + t2;
1056                 tp[8*7] = t7 - t2;
1057 
1058                 in += stride;
1059                 tp += 1;
1060         }
1061         tp -= 8;
1062 
1063         for (i = 8; --i >= 0; ) {
1064                 float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
1065                 t0 = tp[0] + tp[7];
1066                 t7 = tp[0] - tp[7];
1067                 t1 = tp[1] + tp[6];
1068                 t6 = tp[1] - tp[6];
1069                 t2 = tp[2] + tp[5];
1070                 t5 = tp[2] - tp[5];
1071                 t3 = tp[3] + tp[4];
1072                 t4 = tp[3] - tp[4];
1073 
1074                 /* even part */
1075                 x0 = t0 + t3;
1076                 x2 = t1 + t2;
1077                 out[0] = FWD_DandQ(x0 + x2, 0);
1078                 out[4] = FWD_DandQ(x0 - x2, 4);
1079 
1080                 x1 = t0 - t3;
1081                 x3 = t1 - t2;
1082                 t0 = (x1 + x3) * FA1;
1083                 out[2] = FWD_DandQ(x1 + t0, 2);
1084                 out[6] = FWD_DandQ(x1 - t0, 6);
1085 
1086                 /* odd part */
1087                 x0 = t4 + t5;
1088                 x1 = t5 + t6;
1089                 x2 = t6 + t7;
1090 
1091                 t3 = x1 * FA1;
1092                 t4 = t7 - t3;
1093 
1094                 t0 = (x0 - x2) * FA5;
1095                 t1 = x0 * FA2 + t0;
1096                 out[3] = FWD_DandQ(t4 - t1, 3);
1097                 out[5] = FWD_DandQ(t4 + t1, 5);
1098 
1099                 t7 += t3;
1100                 t2 = x2 * FA4 + t0;
1101                 out[1] = FWD_DandQ(t7 + t2, 1);
1102                 out[7] = FWD_DandQ(t7 - t2, 7);
1103 
1104                 out += 8;
1105                 tp += 8;
1106                 qt += 8;
1107         }
1108 }
1109 
1110 /*
1111  * decimate the *rows* of the two input 8x8 DCT matrices into
1112  * a single output matrix.  we decimate rows rather than
1113  * columns even though we want column decimation because
1114  * the DCTs are stored in column order.
1115  */
1116 void
1117 dct_decimate(const short* in0, const short* in1, short* o)
1118 {
1119         for (int k = 0; k < 8; ++k) {
1120                 int x00 = in0[0];
1121                 int x01 = in0[1];
1122                 int x02 = in0[2];
1123                 int x03 = in0[3];
1124                 int x10 = in1[0];
1125                 int x11 = in1[1];
1126                 int x12 = in1[2];
1127                 int x13 = in1[3];
1128 #define X_N 4
1129 #define X_5(v)  ((v) << (X_N - 1))
1130 #define X_25(v)  ((v) << (X_N - 2))
1131 #define X_125(v)  ((v) << (X_N - 3))
1132 #define X_0625(v)  ((v) << (X_N - 4))
1133 #define X_375(v) (X_25(v) + X_125(v))
1134 #define X_625(v) (X_5(v) + X_125(v))
1135 #define X_75(v) (X_5(v) + X_25(v))
1136 #define X_6875(v) (X_5(v) + X_125(v) + X_0625(v))
1137 #define X_1875(v) (X_125(v) + X_0625(v))
1138 #define X_NORM(v) ((v) >> X_N)
1139 
1140                 /*
1141                  * 0.50000000  0.09011998  0.00000000 0.10630376
1142                  *      0.50000000  0.09011998  0.00000000  0.10630376
1143                  * 0.45306372  0.28832037  0.03732892 0.08667963
1144                  *      -0.45306372  0.11942621  0.10630376 -0.06764951
1145                  * 0.00000000  0.49039264  0.17677670 0.00000000
1146                  *      0.00000000 -0.49039264 -0.17677670  0.00000000
1147                  * -0.15909482  0.34009707  0.38408888 0.05735049
1148                  *      0.15909482  0.43576792 -0.09011998 -0.13845632
1149                  * 0.00000000 -0.03732892  0.46193977 0.25663998
1150                  *      0.00000000 -0.03732892  0.46193977  0.25663998
1151                  * 0.10630376 -0.18235049  0.25663998 0.42361940
1152                  *      -0.10630376 -0.16332037 -0.45306372 -0.01587282
1153                  * 0.00000000  0.00000000 -0.07322330 0.41573481
1154                  *      0.00000000  0.00000000  0.07322330 -0.41573481
1155                  * -0.09011998  0.13399123 -0.18766514 0.24442621
1156