/*
 *             Automatically Tuned Linear Algebra Software v3.2
 *                      (C) Copyright 1999 Camm Maguire                      
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the University, the ATLAS group, or the names of its 
 *      contributers may not be used to endorse or promote products derived
 *      from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>

#include "camm_util.h"

/*
 * PREFN/PREFN2 and their str()-wrapped forms PREFA/PREFA2 are not referenced
 * in the visible part of this file; presumably prefetch distances used by
 * sibling kernels -- TODO confirm.  str() is defined elsewhere.
 */
#define PREFN 64
#define PREFN2 32
#define PREFA str(PREFN)
#define PREFA2 str(PREFN2)

/*
 * Successive multiples of 32.  They must stay plain integer literals: the
 * values reach pfch(), which stringifies its displacement argument into the
 * instruction text.
 */
#define C1 32
#define C2 64
#define C3 96
#define C4 128
#define C5 160
#define C6 192
#define C7 224
#define C8 256
#define C9 288

/*
 * Prefetch displacements handed to the bla()/blb() block macros (P1..P4 are
 * used by bl1..bl8; P5..P8 are not referenced in the visible code).
 * NOTE(review): P8 repeats C8 while C9 is otherwise unused -- confirm whether
 * C9 was intended.
 */
#define P1 C2
#define P2 C3
#define P3 C4
#define P4 C5
#define P5 C6
#define P6 C7
#define P7 C8
#define P8 C8


/*
 * Instruction templates.  Every macro expands to a string literal holding one
 * or two AT&T-syntax instructions terminated by "\n\t"; arguments are pasted
 * in textually with '#'.  Register names carry a doubled '%' because the text
 * is consumed by an asm statement that has operands.
 */

/* Non-temporal prefetch of disp_(%e<reg_>). */
#define pfch(disp_,reg_)  "prefetchnta " #disp_ "(%%e" #reg_ ")\n\t"
/* Alternative kept from the original for switching prefetch off:
 *   #define pfch(a_,b_)  "nop\n\t"
 */

/* Push the double at first_(%esi), then the one at second_(%esi). */
#define lc(first_,second_) \
    "fldl " #first_ "(%%esi)\n\t" \
    "fldl " #second_ "(%%esi)\n\t"

/* fmul / fmulp with the two given stack indices, in the order written. */
#define mul(lhs_,rhs_)   "fmul %%st(" #lhs_ "),%%st(" #rhs_ ")\n\t"
#define mulp(lhs_,rhs_)  "fmulp %%st(" #lhs_ "),%%st(" #rhs_ ")\n\t"

/* Add the stack top into st(idx_) and pop. */
#define addp(idx_)       "faddp %%st,%%st(" #idx_ ")\n\t"

/* Push a copy of st(idx_). */
#define dup(idx_)        "fld   %%st(" #idx_ ")\n\t"

/* Push the double at disp_(%e<reg_>). */
#define lb(disp_,reg_)   "fldl " #disp_ "(%%e" #reg_ ")\n\t"

/* Exchange the stack top with st(idx_). */
#define xch(idx_)        "fxch %%st(" #idx_ ")\n\t"

/* Exchange the stack top with st(idx_), then fsubp into st(idx_) and pop. */
#define subp(idx_) \
    "fxch %%st(" #idx_ ")\n\t" \
    "fsubp %%st,%%st(" #idx_ ")\n\t"

/* Pop the stack top into the double at disp_(%e<reg_>). */
#define wb(disp_,reg_)   "fstpl " #disp_ "(%%e" #reg_ ")\n\t"

/*
 * ssub/sadd pick the combining operation for the two sign-sensitive product
 * terms: subtract/add by default, add/subtract when Conj_ is defined.
 */
#ifndef Conj_
#define ssub             subp(1)
#define sadd(idx_)       addp(idx_)
#else
#define ssub             addp(1)
#define sadd(idx_)       subp(idx_)
#endif

/*
 * Multiply-accumulate sequences that update two adjacent doubles in memory.
 *
 * On entry the two values pushed last (call them x = st(0), y = st(1)) sit on
 * top of the x87 stack, with the multipliers below them.
 *
 *   a_, b_ : displacements of the first and second double, relative to %e<c_>
 *   c_     : register suffix of the destination pointer (e.g. ax, bx)
 *   d_, e_ : stack indices of the two multipliers, counted while two
 *            temporaries (the loaded double and a copy of x or y) are on top
 *   f_     : (dp2/dpp2 only) stack index used for both multiplies of the
 *            second half; the stack is one entry shorter at the second
 *            multiply, so the same index reaches the next multiplier down
 *
 * First half (all variants):  load a_, add x*st(d_), combine y*st(e_) through
 * ssub, store back to a_.
 * Second half: load b_, combine x times the e_ multiplier through sadd, add
 * y times the d_ multiplier, store back to b_.
 *
 * dp1 works on copies (dup) and leaves x and y on the stack for another use;
 * dp2 multiplies x and y in place and therefore consumes them.
 */
#define dp1(a_,b_,c_,d_,e_)  lb(a_,c_) dup(1) mul(d_,0) addp(1) dup(2) mul(e_,0) ssub \
                             wb(a_,c_) lb(b_,c_) dup(1) mul(e_,0) sadd(1) dup(2) mul(d_,0)\
                             addp(1) wb(b_,c_)
#define dp2(a_,b_,c_,d_,e_,f_)  lb(a_,c_) dup(1) mul(d_,0) addp(1) dup(2) mul(e_,0) ssub \
                                wb(a_,c_) lb(b_,c_) xch(2) mul(f_,0) addp(2) mul(f_,0) \
                                sadd(1) wb(b_,c_)

/*
 * dpp1/dpp2 are dp1/dp2 with one prefetch issued between the two halves.
 * The trailing two parameters (f_,g_ for dpp1; g_,h_ for dpp2) are the
 * prefetch displacement and the register suffix it is relative to.
 */
#define dpp1(a_,b_,c_,d_,e_,f_,g_)  lb(a_,c_) dup(1) mul(d_,0) addp(1) dup(2) mul(e_,0)\
                                    ssub wb(a_,c_) pfch(f_,g_) lb(b_,c_) dup(1) mul(e_,0)\
                                    sadd(1) dup(2) mul(d_,0) addp(1) wb(b_,c_)
#define dpp2(a_,b_,c_,d_,e_,f_,g_,h_) lb(a_,c_) dup(1) mul(d_,0) addp(1) dup(2) mul(e_,0)\
                                      ssub wb(a_,c_) pfch(g_,h_) lb(b_,c_) xch(2)\
                                      mul(f_,0) addp(2) mul(f_,0) sadd(1) wb(b_,c_)

/*
 * Per-element blocks.  a_ and b_ are the displacements of the two doubles of
 * one element (used both for the source at %esi and for the destination
 * rows), c_ is the prefetch displacement.
 *
 * Each block first runs lc(b_,a_), which pushes the double at b_(%esi) and
 * then the one at a_(%esi), so the a_ value ends up on top of the stack.
 *
 * bla1/blb1 update a single destination row through %eax and consume the
 * loaded pair; they differ only in the prefetch base (%esi vs %eax).
 */
#define bla1(a_,b_,c_)  lc(b_,a_) dpp2(a_,b_,ax,4,5,3,c_,si)
#define blb1(a_,b_,c_)  lc(b_,a_) dpp2(a_,b_,ax,4,5,3,c_,ax)

/*
 * bla2/blb2 update two destination rows: first through %eax with the
 * non-consuming dpp1 (multipliers at stack indices 6,7), then through %ebx
 * with a consuming variant (multipliers at 4,5,3).  bla2 prefetches relative
 * to %esi and %eax; blb2 prefetches relative to %ebx once and uses the
 * prefetch-free dp2 for the second row.
 */
#define bla2(a_,b_,c_)  lc(b_,a_) dpp1(a_,b_,ax,6,7,c_,si) dpp2(a_,b_,bx,4,5,3,c_,ax)
#define blb2(a_,b_,c_)  lc(b_,a_) dpp1(a_,b_,ax,6,7,c_,bx) dp2(a_,b_,bx,4,5,3)


/* Select the 1-row or 2-row variant by pasting NDP (defined elsewhere). */
#define bla(a_,b_,c_)      Mjoin(bla,NDP)(a_,b_,c_)
#define blb(a_,b_,c_)      Mjoin(blb,NDP)(a_,b_,c_)


/*
 * Unrolled runs of 1, 2, 4 and 8 consecutive elements; element k uses
 * displacements 16*k and 16*k+8, alternating bla and blb.  bl8 is only
 * referenced from commented-out code in the visible part of the file.
 */
#define bl1            bla(0,8,P1)
#define bl2            bl1 blb(16,24,P1)
#define bl4            bl2 bla(32,40,P2) blb(48,56,P2)
#define bl8            bl4 bla(64,72,P3) blb(80,88,P3) bla(96,104,P4) blb(112,120,P4)


/*
 * Pointer-advance templates.
 *
 * inca(n)     : add the immediate n to %esi
 * incba(n,r)  : add the immediate n to %e<r>
 * incb1/incb2 : advance %eax, or %eax and %ebx
 * inc(n)      : advance %esi plus the NDP destination pointers by n
 */
#define inca(a_)     "addl $" #a_ ",%%esi\n\t"
#define incba(a_,b_) "addl $" #a_ ",%%e" #b_ "\n\t"
#define incb1(a_)    incba(a_,ax)
#define incb2(a_)    incb1(a_) incba(a_,bx)

#define inc(a_)      inca(a_) Mjoin(incb,NDP)(a_)

/*
 * Fixed-step (8) counterparts of the macros above: inc1a for %esi,
 * inc1b1/inc1b2 for %eax or %eax and %ebx.
 */
#define inc1a         "addl $8,%%esi\n\t"
#define inc1ba(a_)    "addl $8,%%e" #a_ "\n\t"
#define inc1b1        inc1ba(ax)
#define inc1b2        inc1b1 inc1ba(bx)

/*
 * inc1 must paste the inc1b family.  Pasting incb instead would leave a bare
 * incb1/incb2 identifier (a function-like macro without an argument list) in
 * the middle of the asm string and fail to compile on first use.
 */
#define inc1          inc1a Mjoin(inc1b,NDP)


/*
 * Set-up and tear-down templates; each *1/*2 pair is selected by pasting NDP.
 */

/* Push the double at disp_(%esi). */
#define laa(disp_)   "fldl " #disp_ "(%%esi)\n\t"
/* Step %esi forward by the stride held in %edi. */
#define na           "addl %%edi,%%esi\n\t"
/* Push one pair (offset 8 first, then offset 0); la2 pushes a second pair
 * after stepping %esi by the stride. */
#define la1          laa(8) laa(0)
#define la2          la1 na la1

#define la           Mjoin(la,NDP)

/* Pop and discard the x87 stack top; the argument is not used. */
#define ulaa(ignored_) "fstp %%st\n\t"
/* Step %esi back by the stride held in %edi. */
#define pa           "subl %%edi,%%esi\n\t"
/* Undo la1 / la2: one step back and two pops per pair. */
#define ula1         pa ulaa(0) ulaa(8)
#define ula2         ula1 ula1

#define ula          Mjoin(ula,NDP)

/* Copy %esi into %e<reg_>. */
#define lpba(reg_)    "movl %%esi,%%e" #reg_ "\n\t"
/* Same stride step as na, named for the destination-pointer set-up. */
#define npb           na

/* Capture the row pointer(s): %eax = %esi, then (NDP == 2) %ebx = %esi + %edi. */
#define lpb1          lpba(ax)
#define lpb2          lpb1 npb lpba(bx)

#define lpb           Mjoin(lpb,NDP)

/* Prefetch at displacement disp_ from %esi and %eax (and %ebx for two rows). */
#define ipfch1(disp_)   pfch(disp_,si) pfch(disp_,ax)
#define ipfch2(disp_)   ipfch1(disp_)  pfch(disp_,bx)

#define ipfch(disp_)    Mjoin(ipfch,NDP)(disp_)

/*
 * x87 inline-assembly kernel that updates NDP rows of b in place from a and c.
 *
 * As far as the macro expansions in this file show, for each row r in
 * [0, NDP) and each j in [0, len) it performs the complex update
 *     b[r*ldb + j] += a[r*ainc] * c[j]
 * with Conj_ switching the sign of the two product terms that go through
 * ssub/sadd.  NOTE(review): confirm the exact conjugation convention against
 * the caller.
 *
 *   a    : multipliers; element r is read at byte offset r*ainc*sizeof(*a)
 *   ainc : stride between the a elements, in elements
 *   b    : destination rows, updated in place
 *   ldb  : stride between the destination rows, in elements
 *   c    : source vector, read sequentially
 *   len  : number of elements per row
 *
 * Only NDP values 1 and 2 have macro variants in this file.  The code uses
 * 32-bit x86 registers (%esi, %edi, %eax, %ebx, %edx) and the x87 stack.
 * lab/test/je/sub/jmp/align, ASM and NO_INLINE come from camm_util.h; the
 * comments below assume they emit the obvious label / testl / je / subl /
 * jmp / alignment text -- TODO confirm.
 */
static void
Mjoin(g,EXT)(const Dcomplex *a,int ainc,Dcomplex *b,int ldb,const Dcomplex *c,int len) {

    int b2b,a2a;
    NO_INLINE;

    /* Strides converted from element counts to byte offsets for the asm. */
    b2b=ldb*sizeof(*b);
    a2a=ainc*sizeof(*a);

    ASM (

	 /* Push the NDP multiplier pairs from a onto the x87 stack. */
	 "movl %0,%%esi\n\t"  /* a */
	 "movl %1,%%edi\n\t"  /* a2a */

	 la

	 /* Row pointers: %eax = b, and for two rows %ebx = b + b2b. */
	 "movl %2,%%esi\n\t"  /* b */
	 "movl %3,%%edi\n\t"  /* b2b */

	 lpb

	 ipfch(0)

	 /* From here on %esi walks c and %edi holds the remaining count. */
	 "movl %4,%%esi\n\t"  /* c */
	 "movl %5,%%edi\n\t"  /* len */

	 ipfch(32)

	 /* Main loop: four elements (64 bytes) per pass while len >= 4. */
	 lab(loop)
	 test(-4,di)
	 je(2)

	 sub(4,di)
	 align

	 bl4

	 inc(64)

	 jmp(loop)
	 align

	 /* A further-unrolled bl8/bl4 remainder path used to be sketched here
	  * in commented-out form; only the 2- and 1-element tails are live. */

	 /* Remainder: two elements if bit 1 of the count is set. */
	 lab(2)

	 test(2,di)
	 je(1)

	 bl2
	 inc(32)

	 /* Remainder: one element if bit 0 of the count is set. */
	 lab(1)

	 test(1,di)
	 je(stop)

	 bl1

	 lab(stop)

	 /*
	  * Pop the multiplier pairs pushed by la so the x87 stack is left as
	  * it was found.  ula only adjusts %esi and pops registers; it does
	  * not dereference the recomputed %esi.
	  */
	 "movl %0,%%esi\n\t"
	 "movl %1,%%edi\n\t"
	 "movl %%edi,%%edx\n\t"
	 "imul $" str(NDP) ",%%edx\n\t" 
	 "addl %%edx,%%esi\n\t"

	 ula

	 /*
	  * The stores into b go through %eax/%ebx, which the operand list does
	  * not expose, so "memory" is declared; the integer instructions
	  * modify the flags, hence "cc".
	  */
	 ::"m" (a),"m" (a2a),"m" (b),"m" (b2b),"m" (c),"m" (len)
	 :"ax","bx","cx","dx","si","di","cc","memory");

}

