/*
* muls16x16_32 (Ver. 1.0)
* Author: Nicholas Lombardo
* Date:	December 19, 2014
* Test the subroutine muls16x16_32 which will multiply two 16-bit signed numbers
*/
.INCLUDE <m328pdef.inc>

.DSEG

A:		.BYTE	2
B:		.BYTE	2
C:		.BYTE	4

.CSEG

// Test driver: for every (A, B) pair stored in the flash table, compute A*B with
// muls16x16_32, push the 32-bit product on the stack, and finally copy all
// products into r0..r18 so they can be inspected in the register view.
//
// Register roles in this section:
//   r2        pairs left to process (muls16x16_32 saves/restores it)
//   r24       constant 0x00, pushed as a spacer byte after each product
//   Z         read pointer into the flash table (r31:r30 - must stay intact!)
//   r23:r22   A,  r21:r20  B,  r19:r16  product returned by muls16x16_32

.EQU	PAIR_COUNT = 4				// number of (A, B) rows in the table

// The review step below reads the results back starting at RAMEND, so make
// the stack origin explicit instead of relying on the reset value of SP.
	ldi		r16,high(RAMEND)
	out		SPH,r16
	ldi		r16,low(RAMEND)
	out		SPL,r16

	ldi		r16,PAIR_COUNT
	mov		r2,r16				// ldi cannot target r0-r15, so go through r16
	ldi		ZH,high(table<<1)	// <<1 turns the word address into the byte address lpm needs
	ldi		ZL,low(table<<1)
	clr		r24					// spacer byte; r31 is NOT usable for this, it is ZH

// Variable Loading: indirect transfer from table to A/B (low byte first)
loading:
	lpm		r22,Z+
	lpm		r23,Z+
	sts		A,r22
	sts		A+1,r23
	lpm		r20,Z+
	lpm		r21,Z+
	sts		B,r20
	sts		B+1,r21

// Calculate the product of A*B and push to stack, MSByte first
loop:
	rcall	muls16x16_32	// r19:r16 = A*B
	push	r19				// save result on stack
	push	r18
	push	r17
	push	r16
	push	r24				// zero spacer separates consecutive products
	dec		r2				// dec counter, load next pair
	brne	loading

// After all pairs are processed, copy the results to the GPRs for review.
//
// Rather than popping registers from the stack, the Y pointer is used to read
// SRAM downwards from RAMEND (first byte pushed) to SP+1 (last byte pushed).
// The X pointer addresses the register file through its data-space mapping
// (address 0x0001 = r1), so the bytes land in r0..r19 in the order they were
// pushed (FIFO). The stack pointer itself is left untouched.

	ldi		YH,high(RAMEND)	// Y pointer will ld from top of SRAM (stack)
	ldi		YL,low(RAMEND)
	clr		XH				// X pointer will st to GPRs (0x0001 = r1)
	ldi		XL,0x01
	ld		r0,Y			// first pushed byte goes straight to r0
	in		r20,SPL
	inc		r20				// r20 = low byte of SP+1, the address of the last pushed byte

reload:
	ld		r21,-Y			// indirect load from SRAM stack (top --> bot)
	st		X+,r21			// indirect store to GPRs
	cp		r20,YL
	brlo	reload			// unsigned: keep going while Y is above the last pushed byte

// clear other registers for legibility
	ldi		XL,19			// start at r19
clear:
	st		X+,r24			// r24 = 0x00 (writing it onto itself at X=24 is harmless)
	cpi		XL,0x1A			// stop once X reaches its own low byte (r26)
	brne	clear
	clr		r26

// program done
end:
	rjmp 	end
;-----------------------------------------------


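// Where the four products end up in the register file once the program
// reaches "end" (r4, r9, r14 and r19 hold the 0x00 spacer bytes):
// 	  (MSByte) 					(LSByte)
// 1. r0:		r1:		r2:		r3 
// 2. r5:		r6:		r7:		r8
// 3. r10:		r11:	r12:	r13
// 4. r15:		r16:	r17:	r18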

// Test vectors in flash: each row is one 16-bit A followed by one 16-bit B.
// The trailing comment gives the expected signed product, decimal and as
// four hex bytes (MSByte first).
table:
		.DW		8592,	-12643		// 1.  8592 * -12643 = -108628656	(F9 86 75 50)
		.DW		-1333,	1493		// 2. -1333 *   1493 =   -1990169	(FF E1 A1 E7)
		.DW		759,	15			// 3.   759 *     15 =      11385	(00 00 2C 79)
		.DW		0x1074,	0x87B2		// 4.  4212 * -30798 = -129721176	(F8 44 9C A8)


/* muls16x16_32 - signed 16 x 16 -> 32-bit multiply.
* Inputs:	r23:r22 (signed, r23 = high byte) and r21:r20 (signed, r21 = high byte)
* Outputs:	r19:r18:r17:r16 = r23:r22 * r21:r20 (signed, r19 = MSByte)
* Clobbers:	r1:r0 (hold the last partial product on return) and the SREG flags
* Preserves:	r2 (saved on the stack, used internally as a constant zero),
*			and the input registers r20-r23
*/
muls16x16_32:
	push	r2				// caller's r2 is restored before returning
	mul 	r22, r20 		// al * bl -> result bytes 1:0
	movw 	r17:r16, r1:r0
	muls 	r23, r21 		// (signed)ah * (signed)bh -> result bytes 3:2
	movw 	r19:r18, r1:r0
	clr 	r2				// zero operand for the carry-only sbc/adc below (C is not touched)

	// Cross term 1, weighted by 2^8: add into bytes 2:1, sign-extend into byte 3
	mulsu 	r23, r20 		// (signed)ah * bl, C = sign of the product
	sbc 	r19, r2			// negative product: byte 3 gets 0xFF added, i.e. minus 1
	add 	r17, r0
	adc 	r18, r1
	adc 	r19, r2			// propagate the carry out of byte 2

	// Cross term 2, same weight and same handling
	mulsu 	r21, r22 		// (signed)bh * al
	sbc 	r19, r2
	add 	r17, r0
	adc 	r18, r1
	adc 	r19, r2

	pop		r2
	ret