#include "stdafx.h"
#pragma  hdrstop

/***************************************************************************
*
*                INTEL Corporation Proprietary Information  
*
*      
*                  Copyright (c) 1996 Intel Corporation.
*                         All rights reserved.
*
***************************************************************************
			AUTHOR:  Kumar Balasubramanian 
***************************************************************************

** MMX version of the "integer LLM mode" within IJG decompressor code.
** The following is an MMX implementation of the integer slow mode
** IDCT within the IJG code.
*/




#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */

#ifdef DCT_ISLOW_SUPPORTED


/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif



/*
 * Fixed-point scaling parameters.
 *
 * CONST_BITS is 13 for every sample width; only PASS1_BITS depends on
 * BITS_IN_JSAMPLE.  Note that the inline assembly in midct8x8llm does not
 * reference these macros: its shift counts are written as literals.
 */
#define CONST_BITS  13

#if BITS_IN_JSAMPLE == 8
#define PASS1_BITS  2
#else
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#endif

/*
 * Packed 64-bit constants for the case BITS_IN_JSAMPLE = 8.
 *
 * Every constant is loaded by the inline assembly as one 8-byte MMX operand.
 * The multipliers are the named factors scaled by 2^13 (CONST_BITS) and
 * rounded to an integer; the decimal value is given next to each one.
 */

/* Column pass: multiplier in 16-bit words 0 and 2, zero in words 1 and 3. */
static const __int64 const_0_2986 = 0x0000098e0000098e;	/*  2446 */
static const __int64 const_0_3901 = 0x00000c7c00000c7c;	/*  3196 */
static const __int64 const_0_54119 = 0x0000115100001151;	/*  4433 */
static const __int64 const_0_7653 = 0x0000187e0000187e;	/*  6270 */
static const __int64 const_0_899 = 0x00001ccd00001ccd;	/*  7373 */
static const __int64 const_1_175 = 0x000025a1000025a1;	/*  9633 */
static const __int64 const_1_501 = 0x0000300b0000300b;	/* 12299 */
static const __int64 const_1_8477 = 0x00003b2100003b21;	/* 15137 */
static const __int64 const_1_961 = 0x00003ec500003ec5;	/* 16069 */
static const __int64 const_2_053 = 0x000041b3000041b3;	/* 16819 */
static const __int64 const_2_562 = 0x0000520300005203;	/* 20995 */
static const __int64 const_3_072 = 0x0000625400006254;	/* 25172 */

/* Helper values: bit masks, per-dword ones, rounding terms, index mask. */
static const __int64 const_all_ones = 0xffffffffffffffff;	/* every bit set */
static const __int64 const_0_1_0_1 = 0x0000000100000001;	/* 1 in each 32-bit half */
static const __int64 const_zero = 0x0000000000000000;
static const __int64 const_1_0 = 0x0000000100000001;	/* same bits as const_0_1_0_1 */
static const __int64 const_round = 0x0000040000000400;	/* 1 << 10 per dword */
static const __int64 const_round_two = 0x0002000000020000;	/* 1 << 17 per dword */
static const __int64 const_mask = 0x000003ff000003ff;	/* low 10 bits per dword */

/*
 * Row pass: different multipliers share one operand so that a single
 * pmaddwd produces two different products.  "00" in a name marks a
 * 16-bit word that is zero.
 */
static const __int64 const_00_1_84_00_0_765 = 0x00003b210000187e;
static const __int64 const_00_0_5411_00_00 = 0x0000115100000000;
static const __int64 const_3_072_00_1_501_00 = 0x62540000300b0000;
static const __int64 const_0_2986_00_2_053_00 = 0x098e000041b30000;
static const __int64 const_0_899_00_2_562_00 = 0x1ccd000052030000;
static const __int64 const_1_96_00_0_3901_00 = 0x3ec500000c7c0000;
static const __int64 const_1_175_00_00_00 = 0x25a1000000000000;







/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * The work is done by one Microsoft-style inline assembly block that uses
 * MMX instructions and runs two passes:
 *
 *   Pass 1 (label idct_column, 2 iterations) handles four columns per
 *     iteration.  Each row of four coefficients is multiplied by the matching
 *     row of the quantization table (pmullw), the 1-D IDCT is evaluated with
 *     32-bit products (pmaddwd), const_round is added, the results are
 *     shifted right arithmetically by 11 and stored as 16-bit values in the
 *     workspace.
 *   Pass 2 (label idct_row, 8 iterations) reads one workspace row of eight
 *     16-bit values, evaluates the 1-D IDCT, adds const_round_two, shifts
 *     right arithmetically by 18, masks each result with const_mask and uses
 *     it as a byte index into range_limit.  The eight bytes fetched from the
 *     table are written with one 8-byte store.
 *
 * Parameters (as used by the code below):
 *   inptr       - coefficient block, read as 16-bit values, 16 bytes per row.
 *   quantptr    - quantization table, same layout as inptr.
 *   wsptr       - scratch area; pass 1 writes 8 rows of 16 bytes, pass 2
 *                 reads them back.
 *   output_buf  - array of row pointers, read at 4-byte intervals.
 *   output_col  - offset added to each row pointer to form the store address.
 *   range_limit - byte table indexed by the masked (10-bit) results.
 *
 * Notes:
 *   - The shift counts 13, 11 and 18 are literals in the assembly.  With
 *     CONST_BITS = 13 and PASS1_BITS = 2 they equal CONST_BITS,
 *     CONST_BITS - PASS1_BITS and CONST_BITS + PASS1_BITS + 3; the macros
 *     themselves are not referenced.
 *   - Pointers are kept in INT32 locals and 32-bit registers, so the routine
 *     is written for 32-bit x86 only.
 *   - ebx, esi and edi are modified.  NOTE(review): this presumably relies on
 *     the compiler saving those registers around __asm blocks - confirm for
 *     the compiler in use.
 *   - emms is executed before the assembly block ends.
 */

GLOBAL(void)
midct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
		 JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
{

	/* Loop state kept in memory: current input, quantization and workspace
	 * pointers, the loop counter shared by both passes, and the byte offset
	 * into output_buf used by the row pass. */
	INT32 locdwinptr,	locdwqptr, locdwwsptr, locdwcounter, locdwrowctr ;
/* 64-bit spill slots for intermediate results of the column pass.  A
 * trailing "e" marks the values for columns 0 & 2 and a trailing "o" the
 * values for columns 1 & 3 of the current group of four columns. */
__int64 locqwtmp0e,locqwtmp0o, locqwtmp1e, locqwtmp1o, locqwtmp2e ;

__int64 locqwtmp10e	, locqwtmp10o	,locqwtmp11e	,
		 locqwtmp11o	, locqwtmp12e	, locqwtmp12o	,
		 locqwtmp13e	, locqwtmp13o	,locqwtmp0	,
		locqwtmp1	,locqwtmp2	,locqwtmp3	,
		locqwz5e ,locqwz5o	,locqwz1e ,locqwz1o	,
		locqwz13e	,locqwz13o	,locqwz14e	,
		locqwz14o	,locqwz23e	,locqwz23o	,
		locqwz24e	,locqwz24o ;




// Inline assembly to do the IDCT and store the result

__asm {

// Copy the argument pointers into locals; the column loop reloads esi/edi
// from these locals at the top of every iteration.
mov	esi, inptr	; load the input pointer
mov edi, quantptr		; load the quant table pointer

mov locdwinptr, esi	; to be used in the idct_column loop
mov locdwqptr, edi	; to be used in the idct_column loop

mov esi, wsptr
mov locdwcounter, 2	; idct_column loop counter

mov locdwwsptr, esi



;; do the idct on all the columns. Do four columns per
;; iteration of the loop.

// Row k of the coefficient block and of the quantization table is addressed
// at byte offset 16*k; each movq fetches four 16-bit entries of that row.
idct_column:

mov		esi, locdwinptr	; get the source pointer
mov		edi, locdwqptr		; get the quantzn. pointer

// Even part, rows 2 and 6.  The multiplier constants used in this pass hold
// the factor in 16-bit words 0 and 2 and zero in words 1 and 3, so pmaddwd
// yields 32-bit products for columns 0 & 2 only.  A copy shifted right by
// 16 bits (psrlq) brings columns 1 & 3 into those word positions.
;; fetch C2 and Q2
movq	mm0,  [esi+16*2]	; get C2

movq	mm1,  [edi+16*2]	; get Q2

movq	mm2,  [esi+16*6]	; get C6
pmullw	mm0, mm1		; dequantized C2 = z2

movq	mm3, [edi+16*6]	; get Q6

movq	mm6,  const_0_7653	
pmullw	mm2, mm3		; dequant. C6 = z3

movq	mm7,  const_1_8477	
movq	mm4, mm0		; copy z2

pmaddwd	mm4, mm6		; tmp3 - z1 for columns 0 & 2
movq	mm5, mm0		; copy z2

movq	mm3, mm2		; z3 copy
psrlq	mm5, 16			; move z2 columns 1 & 3 to 0 & 2

movq	mm1,  const_0_54119
pmaddwd	mm5, mm6		; tmp3 - z1 for columns 1 & 3

psrlq	mm3, 16			; move z3 columns 1 & 3 to 0 & 2
paddw	mm0, mm2		; z2 + z3

pmaddwd	mm2, mm7		; tmp2 - z1 for columns 0 & 2
movq	mm6, mm0		; z2 + z3 copy

psrlq	mm6, 16			; z2 + z3 columns 1 & 3 in 0 & 2
pmaddwd	mm3, mm7		; tmp2 - z1 for columns 1 & 3

// The z3 * const_1_8477 products in mm2/mm3 are negated below: xor with all
// ones, then add 1 to each 32-bit half.
movq	mm7,  const_all_ones
pmaddwd	mm0, mm1		; z1 columns 0 & 2

pmaddwd	mm6, mm1		; z1 columns 1 & 3
pxor	mm2, mm7		; 1s complement of tmp2 - z1

movq	mm1,  const_0_1_0_1
pxor	mm3, mm7		; 1s complement of tmp2 - z1 

paddd	mm2, mm1		; 2s complement of tmp2 - z1(col 0 &2)
paddd	mm3, mm1		; 2s complement of tmp2 - z1(col 1 & 3)

// NOTE: some trailing comments on the next four paddd instructions are
// misleading.  What the instructions produce is:
//   mm2 = tmp2 (cols 0 & 2)    mm4 = tmp3 (cols 0 & 2)
//   mm3 = tmp2 (cols 1 & 3)    mm5 = tmp3 (cols 1 & 3)
paddd	mm2, mm0		; tmp2 (columns 0 & 2)
paddd	mm4, mm0		; tmp2 (cols. 1 & 3)

// Rows 0 and 4.  pmaddwd with const_1_0 (1 in words 0 and 2) turns the
// 16-bit sum and difference of columns 0 & 2 into 32-bit values; pslld 13
// then scales them.  Columns 1 & 3 go through the same steps on a copy
// shifted right by 16 bits.
;; get C0 and Q0
movq	mm0,  [esi+16*0]	; get C0
paddd	mm3, mm6		; tmp3

movq	mm1,  [edi+16*0]	; getQ0
paddd	mm5, mm6		; tmp3

movq	mm6,  [esi+16*4]	; get C4
pmullw	mm0, mm1		; dequant C0 = z2

movq	mm7,  [edi+16*4]	; get Q4
nop

movq	locqwtmp2e, mm2	; store tmp2 even part
pmullw	mm6, mm7		; dequant C4 = z3

movq	mm7,  const_1_0
movq	mm1, mm0		; copy of z2

paddw	mm0, mm6		; z2+z3
nop

psubw	mm1, mm6		; z2-z3
movq	mm6, mm0		; z2+z3 copy

pmaddwd	mm0, mm7		; get 0 & 2 cols
psrlq	mm6, 16			; get the other two cols.

pmaddwd	mm6, mm7		; 
movq	mm2, mm1		; copy of z2-z3

pmaddwd	mm1, mm7
psrlq	mm2, 16

pmaddwd	mm2, mm7
pslld	mm0, 13			; tmp0 cols 0&2

// Combine the partial results and spill them to the locals:
//   tmp10 = tmp0 + tmp3                tmp13 = tmp0 - tmp3
//   tmp11 = ((z2 - z3) << 13) + tmp2   tmp12 = ((z2 - z3) << 13) - tmp2
// The loads and dequantization of rows 1 and 7 for the odd part are
// interleaved with these stores.
movq	mm7, mm4
pslld	mm6, 13			; tmp0 cols 1 & 3

paddd	mm4, mm0		; 
psubd	mm0, mm7		; 

movq	mm7, mm5
pslld	mm2, 13

movq	locqwtmp13e, mm0	; store tmp13 cols 0&2
paddd	mm5, mm6

movq	mm0, locqwtmp2e
psubd	mm6, mm7


movq	locqwtmp10o, mm5	; store tmp10 cols 1&3
movq	mm7, mm3

movq	locqwtmp13o, mm6	; store tmp13 cols 1&3
paddd	mm3, mm2

movq	locqwtmp10e, mm4	; store tmp10 cols 0&2
pslld	mm1, 13

movq	locqwtmp11o, mm3	; store tmp11 cols 1,3
psubd	mm2, mm7

movq	mm6,  [esi+16*1]
movq	mm3, mm0

movq	locqwtmp12o, mm2	; store tmp12 cols. 1,3
paddd	mm0, mm1

movq	mm7,  [edi+16*1]

movq	locqwtmp11e, mm0	; store tmp11 cols. 0,2
psubd	mm1, mm3

movq	mm0,  [esi+16*7]
pmullw	mm6, mm7	; dequant. C1 = tmp3

movq	locqwtmp12e, mm1

;; completed the even part.
;; Now start the odd part

// NOTE: despite its trailing comment, the next load reads from edi, i.e. it
// fetches the quantization entries for row 7, not the coefficients.
movq	mm1,  [edi+16*7]	; get C7

movq	mm2,  [esi+16*5]	; get C5
pmullw	mm0, mm1	; dequant. C7 = tmp0

movq	mm3,  [edi+16*5]

movq	mm4,  [esi+16*3]
pmullw	mm2, mm3	; dequant. C5 = tmp1

movq	mm5,  [edi+16*3]
movq	mm1, mm0

// The dequantized rows 7, 5, 3 and 1 are saved in locqwtmp0..locqwtmp3 and
// the 16-bit sums are formed:
//   z1 = row7 + row1   z2 = row5 + row3   z3 = row7 + row3   z4 = row5 + row1
movq	locqwtmp3, mm6
pmullw	mm4, mm5	; dequant. C3 = tmp2

movq	locqwtmp0, mm0
paddw	mm0, mm6	; z1 

movq	locqwtmp1, mm2
movq	mm3, mm2

movq	locqwtmp2, mm4
paddw	mm2, mm4	; z2

paddw	mm1, mm4	; z3

// z5 = (z3 + z4) * const_1_175, kept in locqwz5e / locqwz5o.
movq	mm4,  const_1_175
paddw	mm3, mm6	; z4	

movq	mm5, mm1
movq	mm7, mm0

psrlq	mm7, 16		; other two cols. of z1
paddw	mm5, mm3	; z3 + z4

movq	mm6, mm5
pmaddwd	mm5, mm4	; z5 cols 0 & 2

// z1, z2, z3 and z4 are multiplied by const_0_899, const_2_562, const_1_961
// and const_0_3901 respectively; each product is then negated with the
// xor-all-ones / add-one sequence.
pmaddwd	mm0,  const_0_899	; z1 even part
psrlq	mm6, 16

pmaddwd	mm6, mm4	; z5 cols 1 & 3
movq	mm4, mm2	; z2 copy

movq	locqwz5e, mm5
psrlq	mm4, 16		; get z2 cols 1 & 3

pxor	mm0,  const_all_ones
movq	mm5, mm1

movq	locqwz5o, mm6
psrlq	mm5, 16

movq	mm6,  const_2_562
nop

paddd	mm0,  const_0_1_0_1
pmaddwd	mm2, mm6	; z2 cols 0 & 2

movq	locqwz1e, mm0
pmaddwd	mm4, mm6	; z2 cols 1 & 3

pmaddwd	mm7,  const_0_899	; z1
movq	mm0, mm3

movq	mm6,  const_1_961
psrlq	mm0, 16

pxor	mm2,  const_all_ones
pmaddwd	mm1, mm6	; z3 cols 0 & 2

paddd	mm2,  const_0_1_0_1
pmaddwd	mm5, mm6	; z3 cols 1 & 3

movq	mm6,  const_0_3901
nop

pxor	mm4,  const_all_ones
pmaddwd	mm3, mm6	; z4 cols 0 & 2

paddd	mm4,  const_0_1_0_1
pmaddwd	mm0, mm6	; z4 cols 1 & 3

movq	mm6,  const_all_ones
nop

pxor	mm1, mm6
pxor	mm7, mm6

;; twos complement of z1, z2, z3, z4

paddd	mm1,  const_0_1_0_1	
pxor	mm5, mm6

paddd	mm7,  const_0_1_0_1
pxor	mm3, mm6

paddd	mm5,  const_0_1_0_1
nop

movq	locqwz1o, mm7
pxor	mm0, mm6

// z5 is added to the negated z3 and z4 products, then the pair sums
// z1+z3, z2+z3, z2+z4 and z1+z4 are formed and saved in locqwz13*,
// locqwz23*, locqwz24* and locqwz14*.
paddd	mm1, locqwz5e	; z3+z5 cols 0 & 2
nop

movq	mm6, locqwz1e
nop

paddd	mm5, locqwz5o	; z3+z5 cols 1 & 3
paddd	mm6, mm1

paddd	mm3,  const_0_1_0_1
paddd	mm1, mm2

paddd	mm0,  const_0_1_0_1
paddd	mm7, mm5

paddd	mm3, locqwz5e	; z4+z5 cols 0 & 2
paddd	mm5, mm4

// NOTE: the next instruction adds locqwz5o, so it handles cols 1 & 3 (its
// trailing comment says 0 & 2).
paddd	mm0, locqwz5o	; z4+z5 cols 0 & 2
paddd	mm2, mm3

paddd	mm3, locqwz1e
paddd	mm4, mm0

paddd	mm0, locqwz1o

movq	locqwz23e, mm1
nop

movq	locqwz14o, mm0
nop

// The saved odd inputs are now multiplied by their own constants
// (row 7 by const_0_2986, row 5 by const_2_053, row 3 by const_3_072,
// row 1 by const_1_501); the pair sums stored above are added afterwards.
movq	mm0, locqwtmp0
nop

movq	locqwz24e, mm2
movq	mm1, mm0

movq	mm2,  const_0_2986
psrlq	mm1, 16

movq	locqwz14e, mm3
pmaddwd	mm0, mm2	; tmp0 even

movq	mm3, locqwtmp1
pmaddwd	mm1, mm2	; tmp0 odd

movq	locqwz24o, mm4
movq	mm2, mm3

movq	mm4,  const_2_053
psrlq	mm2, 16

movq	locqwz23o, mm5
pmaddwd	mm3, mm4	; tmp1 even

movq	mm5, locqwtmp2
pmaddwd	mm2, mm4	; tmp1 odd

movq	locqwz13e, mm6
movq	mm4, mm5

movq	mm6,  const_3_072
psrlq	mm4, 16

movq	locqwz13o, mm7
pmaddwd	mm5, mm6	; tmp2 even
	
;;;;;;; now calculate tmp0..tmp3
;; then calculate the pre-descaled values
;; this includes the right shift with rounding

// Output rows of this pass (before rounding and the shift by 11):
//   row 0 = tmp10 + tmp3    row 7 = tmp10 - tmp3
//   row 1 = tmp11 + tmp2    row 6 = tmp11 - tmp2
//   row 2 = tmp12 + tmp1    row 5 = tmp12 - tmp1
//   row 3 = tmp13 + tmp0    row 4 = tmp13 - tmp0
// Each punpcklwd / punpckhwd / punpckldq group interleaves the low 16 bits
// of the cols 0 & 2 results with those of the cols 1 & 3 results, giving
// the four columns in order 0, 1, 2, 3 for one 8-byte store.
movq	mm7, locqwtmp3
pmaddwd	mm4, mm6	; tmp2 odd

paddd	mm0, locqwz13e
movq	mm6, mm7

paddd	mm1, locqwz13o
psrlq	mm6, 16

movq	locqwtmp0e, mm0		; tmp0 even
nop

movq	mm0,  const_1_501
nop

movq	locqwtmp0o, mm1
pmaddwd	mm7, mm0

paddd	mm3, locqwz24e
pmaddwd	mm6, mm0

movq	mm0, locqwtmp10e
nop

paddd	mm7, locqwz14e
nop

paddd	mm6, locqwz14o
psubd	mm0, mm7

movq	mm1, locqwtmp10o
nop

movq	locqwtmp1e, mm3
psubd	mm1, mm6

// mm3 holds const_round for the remainder of this iteration.
movq	mm3,  const_round
nop

paddd	mm2, locqwz24o
paddd	mm0, mm3

paddd	mm7, locqwtmp10e
psrad	mm0, 11

movq	locqwtmp1o, mm2
paddd	mm1, mm3

paddd	mm6, locqwtmp10o
psrad	mm1, 11

paddd	mm5, locqwz23e
movq	mm2, mm0

paddd	mm4, locqwz23o
punpcklwd	mm0, mm1

paddd	mm6, mm3
punpckhwd	mm2, mm1

paddd	mm7, mm3
punpckldq	mm0, mm2

;; now do all the stores of the 1D-iDCT of the four columns

mov		edi, locdwwsptr	; get pointer to scratch pad array

movq	 [edi+16*7], mm0	; store wsptr[7]
psrad	mm6, 11

movq	mm2, locqwtmp11e
psrad	mm7, 11

psubd	mm2, mm5
movq	mm0, mm7

movq	mm1, locqwtmp11o
punpcklwd	mm7, mm6

psubd	mm1, mm4
punpckhwd	mm0, mm6

paddd	mm5, locqwtmp11e
punpckldq	mm7, mm0

paddd	mm4, locqwtmp11o
paddd	mm2, mm3

paddd	mm1, mm3
paddd	mm5, mm3

paddd	mm4, mm3
psrad	mm2, 11

movq	 [edi+16*0], mm7	; store wsptr[0]
psrad	mm1, 11

movq	mm0, mm2
psrad	mm5, 11

movq	mm6, locqwtmp12e
punpcklwd	mm2, mm1

punpckhwd	mm0, mm1
movq	mm1, mm5

movq	mm7, locqwtmp12o
punpckldq	mm2, mm0

movq	 [edi+16*6], mm2	; store wsptr[6]
psrad	mm4, 11

movq	mm2, mm6
punpcklwd	mm5, mm4

paddd	mm6, locqwtmp1e
punpckhwd	mm1, mm4

psubd	mm2, locqwtmp1e
punpckldq	mm5, mm1

movq	 [edi+16*1], mm5	; store wsptr[1]
movq	mm0, mm7

paddd	mm7, locqwtmp1o
paddd	mm6, mm3

psubd	mm0, locqwtmp1o
paddd	mm7, mm3

paddd	mm2, mm3
psrad	mm7, 11

paddd	mm0, mm3
psrad	mm6, 11

movq	mm1, mm6
psrad	mm2, 11

movq	mm4, locqwtmp13e
punpcklwd	mm6, mm7

movq	mm5, mm4
punpckhwd	mm1, mm7

paddd	mm4, locqwtmp0e
punpckldq	mm6, mm1

psubd	mm5, locqwtmp0e
psrad	mm0, 11

movq	 [edi+16*2], mm6	; store wsptr[2]
movq	mm6, mm2

paddd	mm4, mm3
punpcklwd	mm2, mm0

paddd	mm5, mm3
punpckhwd	mm6, mm0

movq	mm0, locqwtmp13o
punpckldq	mm2, mm6

movq	mm1, mm0
psrad	mm4, 11

paddd	mm0, locqwtmp0o
psrad	mm5, 11

paddd	mm0, mm3
movq	mm6, mm4

psubd	mm1, locqwtmp0o
psrad	mm0, 11

paddd	mm1, mm3
punpcklwd	mm4, mm0

movq	mm3, mm5
punpckhwd	mm6, mm0

movq	 [edi+16*5], mm2	; store wsptr[5]
punpckldq	mm4, mm6

psrad	mm1, 11

movq	 [edi+16*3], mm4	; store wsptr[3]
punpcklwd	mm5, mm1

punpckhwd	mm3, mm1

punpckldq	mm5, mm3

// Advance the input and quantization pointers by 8 bytes, i.e. by four
// 16-bit columns.
add locdwinptr, 8	; skip first four columns
add	locdwqptr,  8

movq	 [edi+16*4], mm5	; store wsptr[4]


;;;;;;; done with 1D-idct of four columns ;;;;;;;

;; now update pointers for next four columns

add locdwwsptr, 8
mov	eax, locdwcounter

dec eax

mov locdwcounter, eax
jnz idct_column

;;;;;;;end of 1D-idct on the columns ;;;;;;;

// Set up the row pass: locdwwsptr walks the workspace in steps of 16 bytes
// (one row), locdwcounter counts the 8 rows, and locdwrowctr is the byte
// offset into output_buf, advanced by 4 for each row.
mov	esi, wsptr	; get start addr of temp array
mov locdwcounter, 8

mov	locdwwsptr, esi
mov	locdwrowctr, 0

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;; start of 1D-idct on the rows ;;;;;;;


// In this pass e0..e7 denote the eight 16-bit values of one workspace row.
// The packed constants place different multipliers in different 16-bit
// words, so one pmaddwd delivers two different products (one per dword).
idct_row:

mov	esi, locdwwsptr	; get next row start addr of temp array
mov	edi, output_buf	

movq	mm0,  [esi+0]	; get first 4 elements of row

movq	mm1,  [esi+2*4] ; get next 4 elem. of row
movq	mm2, mm0

movq	mm3, mm0	; copy of e3|e2|e1|e0
paddw	mm2, mm1	; (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)

movq	mm4, mm2	; copy of (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
punpckhdq	mm3, mm1	; e7|e6|e3|e2

pmaddwd	mm3,  const_00_1_84_00_0_765	; (tmp2 - z1)||(tmp3-z1)
movq	mm6, mm0	; copy of e3|e2|e1|e0

pmaddwd	mm2,  const_00_0_5411_00_00	; z1||xxx
psubw	mm6, mm1	; (e3-e7)|(e2-e6)|(e1-e5)|(e0-e4)

punpckldq	mm4, mm6	; (e1-e5)|(e0-e4)|(e1+e5)|(e0+e4)
movq	mm6, mm0	; 

// pslld 16 followed by psrad 3 leaves (e0+e4) and (e0-e4), taken from the
// low word of each dword, scaled by 2^13 with the sign preserved.
movq	mm5, mm3
pslld	mm4, 16	; (e0-e4)|(e1+e5)||(e0+e4)|x0000

// The high dword of mm3 (e6 * 1.8477) is negated and moved next to the
// e2 * 0.7653 product so that adding z1 gives tmp2 || tmp3.
pxor	mm3,  const_all_ones
punpckhdq	mm2, mm2	; z1||z1

paddd	mm3,  const_0_1_0_1
psrad	mm4, 3	; (e0-e4)<<13||(e0+e4)<<13

psrlq	mm3, 32
movq	mm7, mm4	; copy of tmp1||tmp0

punpckldq	mm5, mm3
movq	mm3, mm0	; e3|e2|e1|e0

paddd	mm5, mm2		; tmp2 || tmp3
paddw	mm3, mm1	; (e7+e3)|(e2+e6)|(e1+e5)|(e0+e4)

paddd	mm4, mm5
psubd	mm7, mm5


;; end of even part calculation ;;
;; mm0 => e3|e2|e1|e0
;; mm1 => e7|e6|e5|e4
;; mm4 => tmp11||tmp10
;; mm7 => tmp12||tmp13

// Odd part of the row pass.  e7, e5, e3 and e1 take the roles that rows
// 7, 5, 3 and 1 had in the column pass; the z sums are obtained by
// rearranging dwords (punpckldq / punpckhdq) before the 16-bit adds.
movq	mm5, mm3
movq	mm2, mm0

pmaddwd	mm0,  const_3_072_00_1_501_00	; tmp2|tmp3
punpckldq	mm5, mm5

paddw	mm5, mm3
punpckldq	mm2, mm2

pmaddwd	mm5,  const_1_175_00_00_00		; z5|0
punpckhdq	mm6, mm2

pmaddwd		mm3,  const_1_96_00_0_3901_00	; z3|z4
paddw	mm6, mm1

pmaddwd		mm6,  const_0_899_00_2_562_00	; z1|z2
nop

pmaddwd		mm1,  const_0_2986_00_2_053_00	; tmp0|tmp1
punpckhdq	mm5, mm5

// Negate the z3|z4 and z1|z2 products (xor all ones, add 1 per dword),
// then add z5 to z3|z4.
movq	mm2,  const_0_1_0_1
nop

pxor	mm3,  const_all_ones
nop

pxor	mm6,  const_all_ones
paddd	mm3, mm2

paddd	mm6, mm2
paddd	mm3, mm5

movq	mm5, mm6
paddd	mm6, mm3

movq	mm2, mm5
punpckldq	mm5, mm5

punpckhdq	mm2, mm5
paddd	mm1, mm6

paddd	mm2, mm3
movq	mm5, mm1

movq	mm3, mm4
paddd	mm0, mm2

movq	mm2, mm7
punpckldq	mm5, mm5

punpckhdq	mm1, mm5
psubd	mm3, mm0

// mm5 = rounding term added before the shift by 18, mm6 = index mask.
movq	mm5,  const_round_two
paddd	mm0, mm4

movq	mm6,  const_mask
psubd	mm2, mm1

paddd	mm0, mm5
paddd	mm1, mm7



;; descale the resulting coeff values
paddd	mm1, mm5
psrad	mm0, 18

paddd	mm3, mm5
psrad	mm1, 18

paddd	mm2, mm5
psrad	mm3, 18


;; mask the result with RANGE_MASK (least 10 bits)
pand	mm1, mm6	; w2|w3
psrad	mm2, 18

// Table lookups.  Each masked 32-bit result is moved to ebx or edx and used
// as a byte index into range_limit (base in ecx).  The fetched bytes are
// collected in al/ah; shl eax,16 moves the first pair into the upper half
// of eax before the second pair is loaded.
movd	ebx, mm1	; w3
psrlq	mm1, 32		; 0|w2

;; using the results as index, get the corresponding
;; value from array range_limit and store the final result

mov		ecx, range_limit	; get start addr of range_limit array
add	edi, locdwrowctr

movd	edx, mm1	; w2
pand	mm0, mm6	; w1|w0

// edi = address of the current entry of output_buf; the second instruction
// below replaces it with the row pointer stored there.
mov		ah, [ecx][ebx]	; w3
mov		edi, [edi]

movd	ebx, mm0	; w0
psrlq	mm0, 32		; 0|w1

mov		al, [ecx][edx]	; w2
add	locdwrowctr, 4

movd	edx, mm0	; w1
pand	mm3, mm6	; w6|w7

add	edi, output_col	; this is the dest start addr for this row
shl		eax, 16		; w3|w2|0|0

mov		al, [ecx][ebx]	; w0

mov		ah, [ecx][edx]	; w1

movd	mm4, eax	; w3|w2|w1|w0
pand	mm2, mm6	; w5|w4

movd	ebx, mm3	; w7
psrlq	mm3, 32		; 0|w6

movd	edx, mm3	; w6

mov		ah, [ecx][ebx]	; w7

mov		al, [ecx][edx]	; w6

movd	ebx, mm2	; w4
psrlq	mm2, 32		; 0|w5

shl		eax, 16		; w7|w6|0|0

movd	edx, mm2	; w5

mov		al, [ecx][ebx]	; w4

mov		ah, [ecx][edx]	; w5

movd	mm5, eax	; w7|w6|w5|w4

// Join the two 4-byte groups and write all eight output bytes of this row
// with a single movq.
punpckldq	mm4, mm5	; w7|w6|w5|w4|w3|w2|w1|w0

add	locdwwsptr, 16
mov	eax, locdwcounter

movq	 [edi], mm4

;; update address pointer and loop counter

dec eax

mov	locdwcounter, eax
jnz	idct_row

;;;;;;; end of 1D-idct on all the rows ;;;;;;;
 


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

// Leave MMX state so that later x87 floating-point code can run.
emms


} //end of __asm

}

#endif /* DCT_ISLOW_SUPPORTED */
