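/* Frame difference (calc_diff128) and integrator (calc_intg128) routines
   for Intel x86 CPUs, each with a plain and an MMX implementation. */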

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_INTELCPU

#include "video_def.h"

.globl	calc_diff128
.globl	calc_intg128

	.data

/* 64-bit constants for the MMX code paths. Each entry is exactly 8 bytes,
   so aligning the first one keeps every quadword on an 8-byte boundary
   and a movq load never straddles two cache lines. */
	.p2align 3
mm_0:
.byte	0, 0, 0, 0, 0, 0, 0, 0		# all-zero quadword

mm_128b:
.byte	128, 128, 128, 128, 128, 128, 128, 128	# 128 in each of 8 bytes
mm_128w:
.word	128, 128, 128, 128		# 128 in each of 4 words

	.text

#ifndef HAVE_MMX

/*
 * calc_diff128 (plain x86 version)
 *
 * For each pixel i in [0, Number):
 *     Dst[i] = (Src1[i] >> 1) - (Src2[i] >> 1) + 128      (8-bit arithmetic)
 * Halving both inputs first keeps the difference within -127..127, so the
 * biased result always fits in a byte.
 *
 * Number, Dst, Src1 and Src2 are operand macros supplied by video_def.h
 * (not visible here; presumably argument slots relative to ebp - confirm
 * against that header).
 *
 * A count of zero returns immediately. Saves and restores ebx, esi, edi;
 * uses eax and ecx as scratch.
 */
calc_diff128:
	push %ebp
	mov %esp, %ebp
	push %ebx
	push %esi
	push %edi

	mov Number, %ecx	# pixel count
	mov Dst, %edi		# output buffer
	mov Src1, %esi		# new frame
	mov Src2, %ebx		# old frame

	test %ecx, %ecx
	jz 9f			# empty request; a counted loop would wrap to 2^32

0:	mov (%esi), %al		# new pixel
	mov (%ebx), %ah		# old pixel
	shr $1, %al		# halve both before subtracting
	shr $1, %ah
	sub %ah, %al		# new/2 - old/2
	add $128, %al		# move the zero point to 128
	mov %al, (%edi)
	inc %esi
	inc %ebx
	inc %edi
	dec %ecx
	jnz 0b

9:	pop %edi
	pop %esi
	pop %ebx
	pop %ebp
	ret

#else /* HAVE_MMX */

/*
 * calc_diff128 (MMX version)
 *
 * For each pixel i in [0, Number):
 *     Dst[i] = (Src1[i] >> 1) - (Src2[i] >> 1) + 128
 *
 * Pixels are processed four at a time: bytes are widened to words because
 * MMX has no per-byte shift. The 0..3 pixels left over after the last full
 * group are handled by a scalar loop that computes the same value, so any
 * count (including 0 and counts below 4) is processed completely.
 *
 * Number, Dst, Src1 and Src2 are operand macros supplied by video_def.h
 * (not visible here; presumably argument slots relative to ebp - confirm
 * against that header).
 *
 * Saves and restores ebx, esi, edi; uses eax, ecx, edx, mm0, mm1, mm6 and
 * mm7 as scratch. The MMX state is emptied before any return that used it.
 */
calc_diff128:
	push %ebp
	mov %esp, %ebp
	push %ebx
	push %esi
	push %edi

	mov Number, %ecx	# pixel count
	mov Dst, %edi		# output buffer
	mov Src1, %esi		# new frame
	mov Src2, %ebx		# old frame

	mov %ecx, %edx
	and $3, %edx		# edx = leftover pixels for the scalar tail
	shr $2, %ecx		# ecx = number of complete 4-pixel groups
	jz 2f			# no full group: skip the MMX loop entirely

	pxor %mm6, %mm6		# zero, used for widening and packing
	movq mm_128w, %mm7	# 128 in every word

0:	movd (%esi), %mm0	# 00 00 00 00 b3 b2 b1 b0
	punpcklbw %mm6, %mm0	# 00 b3 00 b2 00 b1 00 b0
	movd (%ebx), %mm1	# 00 00 00 00 c3 c2 c1 c0
	punpcklbw %mm6, %mm1	# 00 c3 00 c2 00 c1 00 c0

	psraw $1, %mm0		# halve first, then subtract: avoids a
	psraw $1, %mm1		#  negative-biased shift after subtraction
	psubw %mm1, %mm0	# new/2 - old/2
	paddw %mm7, %mm0	# move the zero point to 128
	packuswb %mm6, %mm0	# back to bytes: 00 00 00 00 d3 d2 d1 d0
	movd %mm0, (%edi)

	add $4, %ebx
	add $4, %esi
	add $4, %edi
	dec %ecx
	jnz 0b

	emms			# leave the FPU usable for the caller

2:	test %edx, %edx
	jz 9f			# count was a multiple of 4

1:	mov (%esi), %al		# scalar tail, same formula per byte
	mov (%ebx), %ah
	shr $1, %al
	shr $1, %ah
	sub %ah, %al
	add $128, %al
	mov %al, (%edi)
	inc %esi
	inc %ebx
	inc %edi
	dec %edx
	jnz 1b

9:	pop %edi
	pop %esi
	pop %ebx
	pop %ebp
	ret

#endif /* HAVE_MMX */


#ifndef HAVE_MMX

/*
 * calc_intg128 (plain x86 version): integrator step, adds a 128-biased
 * buffer onto another buffer in place.
 *
 * Stack arguments, read relative to ebp:
 *    8(%ebp)  number of pixels
 *   12(%ebp)  destination buffer (read and written)
 *   16(%ebp)  second buffer (read only)
 *
 * For every pixel:
 *     dst = clamp(dst + 2 * (src - 128), 0, 255)
 * The clamp gives the same result as the unsigned-saturating pack used by
 * the MMX variant of this routine, so both builds produce identical pixels.
 *
 * A count of zero returns immediately. Saves and restores esi and edi;
 * uses eax, ecx and edx as scratch.
 */
calc_intg128:
	push %ebp
	mov %esp, %ebp
	push %esi
	push %edi

	mov  8(%ebp), %ecx	# number of pixels
	mov 12(%ebp), %edi	# destination
	mov 16(%ebp), %esi	# 2nd buffer

	test %ecx, %ecx
	jz 9f			# empty request; a counted loop would wrap to 2^32

0:	movzbl (%esi), %eax		# s
	movzbl (%edi), %edx		# d
	lea -256(%edx,%eax,2), %eax	# d + 2*(s - 128), range -256..509
	cmp $255, %eax
	jbe 1f				# unsigned compare: already in 0..255
	sar $31, %eax			# negative gives all ones, too large gives 0
	not %eax			# negative ends as 0, too large as 255 in al
1:	mov %al, (%edi)
	inc %esi
	inc %edi
	dec %ecx
	jnz 0b

9:	pop %edi
	pop %esi
	pop %ebp
	ret

#else /* HAVE_MMX */

/*
 * calc_intg128 (MMX version): integrator step, adds a 128-biased buffer
 * onto another buffer in place.
 *
 * Stack arguments, read relative to ebp:
 *    8(%ebp)  number of pixels
 *   12(%ebp)  destination buffer (read and written)
 *   16(%ebp)  second buffer (read only)
 *
 * For every pixel:
 *     dst = clamp(dst + 2 * (src - 128), 0, 255)
 *
 * Four pixels are handled per MMX iteration using word arithmetic and an
 * unsigned-saturating pack. The 0..3 pixels left over after the last full
 * group go through a scalar loop with an explicit clamp that yields the
 * same value, so any count (including 0 and counts below 4) is processed
 * completely.
 *
 * Saves and restores esi and edi; uses eax, ecx, edx, mm0, mm1, mm6 and
 * mm7 as scratch. The MMX state is emptied before any return that used it.
 */
calc_intg128:
	push %ebp
	mov %esp, %ebp
	push %esi
	push %edi

	mov  8(%ebp), %ecx	# number of pixels
	mov 12(%ebp), %edi	# destination
	mov 16(%ebp), %esi	# 2nd buffer

	mov %ecx, %edx
	and $3, %edx		# edx = leftover pixels for the scalar tail
	shr $2, %ecx		# ecx = number of complete 4-pixel groups
	jz 2f			# no full group: skip the MMX loop entirely

	pxor %mm6, %mm6		# zero, used for widening and packing
	movq mm_128w, %mm7	# 128 in every word

0:	movd (%edi), %mm0	# 00 00 00 00 b3 b2 b1 b0
	punpcklbw %mm6, %mm0	# 00 b3 00 b2 00 b1 00 b0
	movd (%esi), %mm1	# 00 00 00 00 c3 c2 c1 c0
	punpcklbw %mm6, %mm1	# 00 c3 00 c2 00 c1 00 c0
	add $4, %esi		# integer work interleaved with MMX work
	psubw %mm7, %mm1	# s - 128
	psllw $1, %mm1		# * 2
	paddw %mm1, %mm0	# d + 2*(s - 128)
	packuswb %mm6, %mm0	# clamp to 0..255: 00 00 00 00 d3 d2 d1 d0
	movd %mm0, (%edi)	# store back
	add $4, %edi
	dec %ecx
	jnz 0b

	emms			# leave the FPU usable for the caller

2:	mov %edx, %ecx		# ecx = leftover pixel count
	test %ecx, %ecx
	jz 9f			# count was a multiple of 4

1:	movzbl (%esi), %eax		# s
	movzbl (%edi), %edx		# d
	lea -256(%edx,%eax,2), %eax	# d + 2*(s - 128), range -256..509
	cmp $255, %eax
	jbe 3f				# unsigned compare: already in 0..255
	sar $31, %eax			# negative gives all ones, too large gives 0
	not %eax			# negative ends as 0, too large as 255 in al
3:	mov %al, (%edi)
	inc %esi
	inc %edi
	dec %ecx
	jnz 1b

9:	pop %edi
	pop %esi
	pop %ebp
	ret

#endif /* HAVE_MMX */

#endif /* HAVE_INTELCPU */
