/*  Vector Dot Product
 *  This program computes a simple vector dot product using two
 * hard-wired input buffers of 128 samples each. The samples are in
 * signed 1.15 fractional format.
 */
# mach: bfin

.include "testutils.inc"
	// Test body: dot product of data0 and data1 using a dual MAC in a
	// single-instruction hardware loop.  "start", "loadsym" and "pass"
	// are not defined in this file; presumably they are macros from
	// testutils.inc.
	start

	// load buffer addresses into the index registers
	loadsym I0, data0;
	loadsym I1, data1;

	// loop control
	// Each dual MAC consumes one 32-bit word (two 16-bit samples) from
	// each buffer, so 128 samples per buffer need 64 dual MACs: 63 are
	// done by the hardware loop and the last one follows the loop.
	P4 = 63;
	LSETUP ( loop1 , loop1 ) LC0 = P4;
	A1 = A0 = 0;

	// Prime R0/R1 with the first word of each buffer.  These two 32-bit
	// loads are issued as separate instructions; inside the loop the
	// loads for the next step are issued in parallel with the dual MAC.

	R0 = [ I0 ++ ];	R1 = [ I1 ++ ];

	// Loop body (one instruction): A1 accumulates the products of the
	// high halves, A0 the products of the low halves, while the next
	// word of each buffer is fetched into R0/R1.
loop1:  A1 += R0.H * R1.H, A0 += R0.L * R1.L || R0 = [ I0 ++ ] || R1 = [ I1 ++ ];

	// The words fetched by the last loop iteration have not been
	// accumulated yet; do the final dual MAC without any further loads.
	A1 += R0.H * R1.H, A0 += R0.L * R1.L;

	// extract two partial results from accumulators
	// and do final addition
	R0 = ( A0 += A1 );

	// Check both halves of the 32-bit result against the expected value.
	// With both buffers holding 0, 2, ..., 0xFE this is
	// 2 * sum((2k)^2, k = 0..127) = 0x00545600 (each product appears
	// doubled, consistent with a fractional multiply).
	DBGA ( R0.L , 0x5600 );	// 0x00545600 = 0.002574 fract
	DBGA ( R0.H , 0x0054 );

	pass

	.data

/* First input buffer: 128 16-bit samples, 0x00 to 0xFE in steps of 2.
 * The code reads it two samples at a time with 32-bit loads through I0,
 * so keep the buffer explicitly 4-byte aligned instead of relying on
 * where the section happens to be placed.
 */
	.balign 4
data0:
	.dw 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E
	.dw 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E
	.dw 0x20, 0x22, 0x24, 0x26, 0x28, 0x2A, 0x2C, 0x2E
	.dw 0x30, 0x32, 0x34, 0x36, 0x38, 0x3A, 0x3C, 0x3E
	.dw 0x40, 0x42, 0x44, 0x46, 0x48, 0x4A, 0x4C, 0x4E
	.dw 0x50, 0x52, 0x54, 0x56, 0x58, 0x5A, 0x5C, 0x5E
	.dw 0x60, 0x62, 0x64, 0x66, 0x68, 0x6A, 0x6C, 0x6E
	.dw 0x70, 0x72, 0x74, 0x76, 0x78, 0x7A, 0x7C, 0x7E
	.dw 0x80, 0x82, 0x84, 0x86, 0x88, 0x8A, 0x8C, 0x8E
	.dw 0x90, 0x92, 0x94, 0x96, 0x98, 0x9A, 0x9C, 0x9E
	.dw 0xA0, 0xA2, 0xA4, 0xA6, 0xA8, 0xAA, 0xAC, 0xAE
	.dw 0xB0, 0xB2, 0xB4, 0xB6, 0xB8, 0xBA, 0xBC, 0xBE
	.dw 0xC0, 0xC2, 0xC4, 0xC6, 0xC8, 0xCA, 0xCC, 0xCE
	.dw 0xD0, 0xD2, 0xD4, 0xD6, 0xD8, 0xDA, 0xDC, 0xDE
	.dw 0xE0, 0xE2, 0xE4, 0xE6, 0xE8, 0xEA, 0xEC, 0xEE
	.dw 0xF0, 0xF2, 0xF4, 0xF6, 0xF8, 0xFA, 0xFC, 0xFE

/* Second input buffer: 128 16-bit samples, 0x00 to 0xFE in steps of 2
 * (the same contents as data0).  The code reads it two samples at a
 * time with 32-bit loads through I1, so keep it explicitly 4-byte
 * aligned.
 */
	.balign 4
data1:
	.dw 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E
	.dw 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E
	.dw 0x20, 0x22, 0x24, 0x26, 0x28, 0x2A, 0x2C, 0x2E
	.dw 0x30, 0x32, 0x34, 0x36, 0x38, 0x3A, 0x3C, 0x3E
	.dw 0x40, 0x42, 0x44, 0x46, 0x48, 0x4A, 0x4C, 0x4E
	.dw 0x50, 0x52, 0x54, 0x56, 0x58, 0x5A, 0x5C, 0x5E
	.dw 0x60, 0x62, 0x64, 0x66, 0x68, 0x6A, 0x6C, 0x6E
	.dw 0x70, 0x72, 0x74, 0x76, 0x78, 0x7A, 0x7C, 0x7E
	.dw 0x80, 0x82, 0x84, 0x86, 0x88, 0x8A, 0x8C, 0x8E
	.dw 0x90, 0x92, 0x94, 0x96, 0x98, 0x9A, 0x9C, 0x9E
	.dw 0xA0, 0xA2, 0xA4, 0xA6, 0xA8, 0xAA, 0xAC, 0xAE
	.dw 0xB0, 0xB2, 0xB4, 0xB6, 0xB8, 0xBA, 0xBC, 0xBE
	.dw 0xC0, 0xC2, 0xC4, 0xC6, 0xC8, 0xCA, 0xCC, 0xCE
	.dw 0xD0, 0xD2, 0xD4, 0xD6, 0xD8, 0xDA, 0xDC, 0xDE
	.dw 0xE0, 0xE2, 0xE4, 0xE6, 0xE8, 0xEA, 0xEC, 0xEE
	.dw 0xF0, 0xF2, 0xF4, 0xF6, 0xF8, 0xFA, 0xFC, 0xFE