HOOPS/3dGS I.M. Interface



hpspsf.h

/*
 * Copyright (c) 2005 by Tech Soft 3D, LLC.
 * The information contained herein is confidential and proprietary to
 * Tech Soft 3D, LLC., and considered a trade secret as defined under
 * civil and criminal statutes.  Tech Soft 3D shall pursue its civil
 * and criminal remedies in the event of unauthorized use or misappropriation
 * of its trade secrets.  Use of this information by anyone other than
 * authorized employees of Tech Soft 3D, LLC. is granted only under a
 * written non-disclosure agreement, expressly prescribing the scope and
 * manner of such use.
 *
 * $Id: hpspsf_8h-source.html,v 1.29 2008-03-10 07:09:28 stage Exp $
 */

#ifndef HOOPS_PROCESSOR_SPECIFIC_FUNCTIONALITY_DEFINED

#ifdef USE_M_IX86_ASM
    /*
        Uppercase names are macros, lowercase is straight asm.
        The general-register names imply indirection when used with these macros
        (i.e., EAX means [eax]).
     */
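    /*
        For example (illustrative, not part of the original header):
            MOVAPS (XMM0, EAX)
        assembles to "movaps xmm0, [eax]" with either implementation below,
        while MOVAPS (EAX, XMM0) stores xmm0 to [eax].
     */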

#   ifdef __INTEL_COMPILER
#       define  EAX             [eax]
#       define  EBX             [ebx]
#       define  ECX             [ecx]
#       define  EDX             [edx]
#       define  EDI             [edi]
        /* XMM* should be ok as-is */


#       define  FXSAVE(d)               __asm   fxsave      d
#       define  FXRSTOR(d)              __asm   fxrstor     d
#       define  LDMXCSR(d)              __asm   ldmxcsr     d
#       define  STMXCSR(d)              __asm   stmxcsr     d

#       define  PREFETCHNTA(d)          __asm   prefetchnta d
#       define  PREFETCHT0(d)           __asm   prefetcht0  d
#       define  PREFETCHT1(d)           __asm   prefetcht1  d
#       define  PREFETCHT2(d)           __asm   prefetcht2  d

#       define  ADDPS(d,s)              __asm   addps       d, s
#       define  SUBPS(d,s)              __asm   subps       d, s
#       define  MULPS(d,s)              __asm   mulps       d, s
#       define  DIVPS(d,s)              __asm   divps       d, s
#       define  MINPS(d,s)              __asm   minps       d, s
#       define  MAXPS(d,s)              __asm   maxps       d, s
#       define  SQRTPS(d,s)             __asm   sqrtps      d, s
#       define  RCPPS(d,s)              __asm   rcpps       d, s
#       define  RSQRTPS(d,s)            __asm   rsqrtps     d, s

#       define  ANDPS(d,s)              __asm   andps       d, s
#       define  ANDNPS(d,s)             __asm   andnps      d, s
#       define  ORPS(d,s)               __asm   orps        d, s
#       define  XORPS(d,s)              __asm   xorps       d, s

#       define  UNPCKHPS(d,s)           __asm   unpckhps    d, s
#       define  UNPCKLPS(d,s)           __asm   unpcklps    d, s

#       define  CMPPS(d,s,c)            __asm   cmpps       d, s, c
#       define  SHUFPS(d,s,c)           __asm   shufps      d, s, c

#       define  MOVHLPS(d,s)            __asm   movhlps     d, s
#       define  MOVLHPS(d,s)            __asm   movlhps     d, s
#       define  MOVLPS(d,s)             __asm   movlps      d, s
#       define  MOVHPS(d,s)             __asm   movhps      d, s
#       define  MOVAPS(d,s)             __asm   movaps      d, s
#       define  MOVUPS(d,s)             __asm   movups      d, s

#     if 0  /* currently only using SSE 4-float PS operations */
#       define  CVTDQ2PS(d,s)           __asm   cvtdq2ps    d, s

#       define  CVTPI2PS(d,s)           __asm   cvtpi2ps    d, s
#       define  CVTPS2PI(d,s)           __asm   cvtps2pi    d, s

#       define  ADDSS(d,s)              __asm   addss       d, s
#       define  SUBSS(d,s)              __asm   subss       d, s
#       define  MULSS(d,s)              __asm   mulss       d, s
#       define  DIVSS(d,s)              __asm   divss       d, s
#       define  MINSS(d,s)              __asm   minss       d, s
#       define  MAXSS(d,s)              __asm   maxss       d, s
#       define  SQRTSS(d,s)             __asm   sqrtss      d, s
#       define  RCPSS(d,s)              __asm   rcpss       d, s
#       define  RSQRTSS(d,s)            __asm   rsqrtss     d, s

#       define  CMPSS(d,s,c)            __asm   cmpss       d, s, c

#       define  MOVSS(d,s)              __asm   movss       d, s

#       define  CVTSI2SS(d,s)           __asm   cvtsi2ss    d, s
#       define  CVTSS2SI(d,s)           __asm   cvtss2si    d, s

#       define  CVTDQ2PS(d,s)           __asm   cvtdq2ps    d, s
#       define  CVTPS2DQ(d,s)           __asm   cvtps2dq    d, s
#       define  PAND(d,s)               __asm   pand        d, s
#       define  PANDN(d,s)              __asm   pandn       d, s
#       define  POR(d,s)                __asm   por         d, s
#       define  PXOR(d,s)               __asm   pxor        d, s

#       define  MOVSLDUP(d,s)           __asm   movsldup    d, s
#       define  MOVSHDUP(d,s)           __asm   movshdup    d, s
#       define  ADDSUBPS(d,s)           __asm   addsubps    d, s
#       define  HADDPS(d,s)             __asm   haddps      d, s
#       define  HSUBPS(d,s)             __asm   hsubps      d, s
#     endif
#   else
        /* high nibble Mod (and mask for certain operations), low nibble Reg or R/M */
#       define  EAX             0x00
#       define  EBX             0x03    /* yes, out of order: the hardware encoding is eax=0, ecx=1, edx=2, ebx=3 */
#       define  ECX             0x01
#       define  EDX             0x02
#       define  EDI             0x07
#       define  XMM0            0xF0
#       define  XMM1            0xF1
#       define  XMM2            0xF2
#       define  XMM3            0xF3
#       define  XMM4            0xF4
#       define  XMM5            0xF5
#       define  XMM6            0xF6
#       define  XMM7            0xF7

#       define  EMIT(x)                 __asm _emit x

        /*******************************************************************************/
        /* opcode prefix values: all SSE opcodes start with 0F, some take an additional prefix first */
#       define  Op0F            EMIT(0x0F)      /* all  SSE  */
#       define  Op66            EMIT(0x66)      /* some SSE2 */
#       define  OpF2            EMIT(0xF2)      /* some SSE3 */
#       define  OpF3            EMIT(0xF3)      /* all SS (scalar) versions of PS ops */

        /* utility macros for addressing:
            Mod is 3 for XMM <- XMM, 0 for a straight memory reference (1 & 2 for displacements, currently unused)
            any memory reference always goes in R/M
            otherwise the destination goes in Reg
         */
#       define  ModRegRM(mod,reg,rm)    EMIT (((mod & 3) << 6) | ((reg & 7) << 3) | (rm & 7))
#       define  DS_Mod(d,s)     (((d & s) >> 4) & 3)
#       define  DS_Reg(d,s)     ((d & (d >> 4)) | (s & ~(d >> 4)))
#       define  DS_RM(d,s)      ((s & (d >> 4)) | (d & ~(d >> 4)))
#       define  DS_ModRM(d,s)   ModRegRM (DS_Mod(d,s), DS_Reg(d,s), DS_RM(d,s))
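
        /* Worked example (illustrative, not part of the original header):
            with d = XMM1 (0xF1) and s = EAX (0x00),
                DS_Mod = ((0xF1 & 0x00) >> 4) & 3        = 0     (memory operand)
                DS_Reg = (0xF1 & 0x0F) | (0x00 & ~0x0F)  = 1     (xmm1 in Reg)
                DS_RM  = (0x00 & 0x0F) | (0xF1 & ~0x0F)  = 0xF0  (masked to 0: eax in R/M)
            so DS_ModRM (XMM1, EAX) emits 0x08 (mod=00, reg=001, rm=000) */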

        /* some opcodes like MOVxPS bump up 1 if the source is XMM and the destination is memory
            (could also bump if the destination is XMM, but that would require reversing the args as well)
         */
#       define  BumpX2M(op,bmp,d,s)     (op  |  ((s >> 4) & (~d >> 4) & bmp))

#       define  SSE_OP(op,d,s)          Op0F    EMIT (op)       DS_ModRM (d, s)
#       define  SSE_PS(op,bmp,d,s)      SSE_OP (BumpX2M (op,bmp,d,s), d, s)
#       define  SSE_SS(op,bmp,d,s)      OpF3    SSE_PS (op,bmp,d,s)
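
        /* Worked example (illustrative, not part of the original header), using
           MOVAPS as defined below:
            MOVAPS (XMM1, EAX)  emits  0F 28 08   (movaps xmm1, [eax])
            MOVAPS (EAX, XMM1)  emits  0F 29 08   (movaps [eax], xmm1)
           in the second form BumpX2M raises 0x28 to 0x29 because the source is
           XMM and the destination is memory */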

        /*******************************************************************************/
        /* SSE */

        /* control */
#       define  FXSAVE(d)               SSE_OP (0xAE,d,0)               /* Float state -> [d] */
#       define  FXRSTOR(d)              SSE_OP (0xAE,d,1)               /* Float state <- [d] */
#       define  LDMXCSR(d)              SSE_OP (0xAE,d,2)               /* MXCSR <- d */
#       define  STMXCSR(d)              SSE_OP (0xAE,d,3)               /* MXCSR -> d */

#       define  PREFETCHNTA(d)          SSE_OP (0x18,d,0)               /* non-temporal */
#       define  PREFETCHT0(d)           SSE_OP (0x18,d,1)               /* all cache levels */
#       define  PREFETCHT1(d)           SSE_OP (0x18,d,2)               /* second-level cache and up */
#       define  PREFETCHT2(d)           SSE_OP (0x18,d,3)               /* third-level cache and up */
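
        /* e.g. (illustrative, not part of the original header):
           PREFETCHNTA (ECX) emits 0F 18 01, i.e. prefetchnta [ecx] */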

        /* normal PS functions act on 4 single floats in parallel */
#       define  ADDPS(d,s)              SSE_PS (0x58,0,d,s)             /* 4 x  d <- d + s */
#       define  SUBPS(d,s)              SSE_PS (0x5C,0,d,s)             /* 4 x  d <- d - s */
#       define  MULPS(d,s)              SSE_PS (0x59,0,d,s)             /* 4 x  d <- d * s */
#       define  DIVPS(d,s)              SSE_PS (0x5E,0,d,s)             /* 4 x  d <- d / s */
#       define  MINPS(d,s)              SSE_PS (0x5D,0,d,s)             /* 4 x  d <- min (d,s)  */
#       define  MAXPS(d,s)              SSE_PS (0x5F,0,d,s)             /* 4 x  d <- max (d,s)  */
#       define  SQRTPS(d,s)             SSE_PS (0x51,0,d,s)             /* 4 x  d <- sqrt(s) */
#       define  RCPPS(d,s)              SSE_PS (0x53,0,d,s)             /* 4 x  d <~ 1/s  (approximate) */
#       define  RSQRTPS(d,s)            SSE_PS (0x52,0,d,s)             /* 4 x  d <~ 1/sqrt(s)  (approximate) */

#       define  ANDPS(d,s)              SSE_PS (0x54,0,d,s)             /* 4 x  d <- d & s */
#       define  ANDNPS(d,s)             SSE_PS (0x55,0,d,s)             /* 4 x  d <- ~d & s */
#       define  ORPS(d,s)               SSE_PS (0x56,0,d,s)             /* 4 x  d <- d | s */
#       define  XORPS(d,s)              SSE_PS (0x57,0,d,s)             /* 4 x  d <- d ^ s */

#       define  UNPCKHPS(d,s)           SSE_PS (0x15,0,d,s)             /* d <- (d2,s2,d3,s3) */
#       define  UNPCKLPS(d,s)           SSE_PS (0x14,0,d,s)             /* d <- (d0,s0,d1,s1) */

#       define  CMPPS(d,s,c)            SSE_PS (0xC2,0,d,s) EMIT (c)    /* 4 x  d <- (d c s) ? ~0 : 0 */
#       define  SHUFPS(d,s,c)           SSE_PS (0xC6,0,d,s) EMIT (c)    /* d <- (d[c01], d[c23], s[c45], s[c67]) */

#       define  MOVHLPS(d,s)            SSE_PS (0x12,0,d,s)             /* d01 <- s23 */
#       define  MOVLHPS(d,s)            SSE_PS (0x16,0,d,s)             /* d23 <- s01 */
#       define  MOVLPS(d,s)             SSE_PS (0x12,1,d,s)             /* d01 <- sm OR dm <- s01 */
#       define  MOVHPS(d,s)             SSE_PS (0x16,1,d,s)             /* d23 <- sm OR dm <- s23 */
#       define  MOVAPS(d,s)             SSE_PS (0x28,1,d,s)             /* 4 x  d <- s  (aligned) */
#       define  MOVUPS(d,s)             SSE_PS (0x10,1,d,s)             /* 4 x  d <- s */

#     if 0  /* currently only using SSE 4-float PS operations */
#       define  CVTDQ2PS(d,s)           SSE_PS (0x5B,0,d,s)             /* 4 x  Float <- Int */

        /* limited PS functions acting on 2 single floats */
#       define  CVTPI2PS(d,s)           SSE_PS (0x2A,0,d,s)             /* 2 x  Float <- Int */
#       define  CVTPS2PI(d,s)           SSE_PS (0x2D,0,d,s)             /* 2 x  Int <- Float */

        /* normal SS functions act on 1 single float */
#       define  ADDSS(d,s)              SSE_SS (0x58,0,d,s)             /* 1 x  d <- d + s */
#       define  SUBSS(d,s)              SSE_SS (0x5C,0,d,s)             /* 1 x  d <- d - s */
#       define  MULSS(d,s)              SSE_SS (0x59,0,d,s)             /* 1 x  d <- d * s */
#       define  DIVSS(d,s)              SSE_SS (0x5E,0,d,s)             /* 1 x  d <- d / s */
#       define  MINSS(d,s)              SSE_SS (0x5D,0,d,s)             /* 1 x  d <- min (d,s)  */
#       define  MAXSS(d,s)              SSE_SS (0x5F,0,d,s)             /* 1 x  d <- max (d,s)  */
#       define  SQRTSS(d,s)             SSE_SS (0x51,0,d,s)             /* 1 x  d <- sqrt(s) */
#       define  RCPSS(d,s)              SSE_SS (0x53,0,d,s)             /* 1 x  d <~ 1/s  (approximate) */
#       define  RSQRTSS(d,s)            SSE_SS (0x52,0,d,s)             /* 1 x  d <~ 1/sqrt(s)  (approximate) */

#       define  CMPSS(d,s,c)            SSE_SS (0xC2,0,d,s) EMIT (c)    /* 1 x  d <- (d c s) ? ~0 : 0 */

#       define  MOVSS(d,s)              SSE_SS (0x10,1,d,s)             /* d0 <- s0 OR d <- (sm0,0,0,0) */

#       define  CVTSI2SS(d,s)           SSE_SS (0x2A,0,d,s)             /* 1 x  Float <- Int */
#       define  CVTSS2SI(d,s)           SSE_SS (0x2D,0,d,s)             /* 1 x  Int <- Float */

        /* SSE2 */
#       define  CVTDQ2PS(d,s)           SSE_PS (0x5B,0,d,s)             /* 4 x  Float <- Int */
#       define  CVTPS2DQ(d,s)   Op66    SSE_PS (0x5B,0,d,s)             /* 4 x  Int <- Float */
#       define  PAND(d,s)       Op66    SSE_PS (0xDB,0,d,s)             /* 4 x  d <- d & s */
#       define  PANDN(d,s)      Op66    SSE_PS (0xDF,0,d,s)             /* 4 x  d <- ~d & s */
#       define  POR(d,s)        Op66    SSE_PS (0xEB,0,d,s)             /* 4 x  d <- d | s */
#       define  PXOR(d,s)       Op66    SSE_PS (0xEF,0,d,s)             /* 4 x  d <- d ^ s */
#       define  PSHUFD(d,s,c)   Op66    SSE_PS (0x70,0,d,s) EMIT (c)    /* d <- (s[c01], s[c23], s[c45], s[c67]) */

        /* SSE3 */
#       define  MOVSLDUP(d,s)           SSE_SS (0x12,0,d,s)             /* d <- (s0,s0,s2,s2) */
#       define  MOVSHDUP(d,s)           SSE_SS (0x16,0,d,s)             /* d <- (s1,s1,s3,s3) */
#       define  ADDSUBPS(d,s)   OpF2    SSE_PS (0xD0,0,d,s)             /* d <- (d0-s0, d1+s1, d2-s2, d3+s3) */
#       define  HADDPS(d,s)     OpF2    SSE_PS (0x7C,0,d,s)             /* d <- (d0+d1, d2+d3, s0+s1, s2+s3) */
#       define  HSUBPS(d,s)     OpF2    SSE_PS (0x7D,0,d,s)             /* d <- (d0-d1, d2-d3, s0-s1, s2-s3) */
#     endif
#   endif

    /* Pseudo-op versions of CMPxS (the UNORD and N* forms are true when an operand is NaN) */
#   define  CMPEQPS(d,s)    CMPPS (d, s, 0) /* d == s */
#   define  CMPLTPS(d,s)    CMPPS (d, s, 1) /* d < s */
#   define  CMPLEPS(d,s)    CMPPS (d, s, 2) /* d <= s */
#   define  CMPUNORDPS(d,s) CMPPS (d, s, 3) /* either d or s is NaN */
#   define  CMPNEQPS(d,s)   CMPPS (d, s, 4) /* d != s */
#   define  CMPNLTPS(d,s)   CMPPS (d, s, 5) /* !(d < s) */
#   define  CMPNLEPS(d,s)   CMPPS (d, s, 6) /* !(d <= s) */
#   define  CMPORDPS(d,s)   CMPPS (d, s, 7) /* neither d nor s is NaN */

#   define  CMPEQSS(d,s)    CMPSS (d, s, 0) /* d == s */
#   define  CMPLTSS(d,s)    CMPSS (d, s, 1) /* d < s */
#   define  CMPLESS(d,s)    CMPSS (d, s, 2) /* d <= s */
#   define  CMPUNORDSS(d,s) CMPSS (d, s, 3) /* either d or s is NaN */
#   define  CMPNEQSS(d,s)   CMPSS (d, s, 4) /* d != s */
#   define  CMPNLTSS(d,s)   CMPSS (d, s, 5) /* !(d < s) */
#   define  CMPNLESS(d,s)   CMPSS (d, s, 6) /* !(d <= s) */
#   define  CMPORDSS(d,s)   CMPSS (d, s, 7) /* neither d nor s is NaN */
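
    /* Illustrative sketch (not part of the original header): each lane of a CMP
       result is an all-ones or all-zeros mask, so a branchless per-lane select
       of a (xmm2) vs b (xmm3) on (xmm0 < xmm1) can be built as:
            CMPLTPS (XMM0, XMM1)    ; xmm0 <- mask of (xmm0 < xmm1)
            ANDPS   (XMM2, XMM0)    ; xmm2 <- a & mask
            ANDNPS  (XMM0, XMM3)    ; xmm0 <- ~mask & b
            ORPS    (XMM2, XMM0)    ; xmm2 <- blended result */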

    /* defines the selector byte 'c' for SHUFPS(d,s,c) if you don't want to just specify a hex value */
    /* d0 goes in low part of destination, s1 in high part */
#   define  SHUFSEL(d0,d1,s0,s1)        ((s1<<6) | (s0<<4) | (d1<<2) | d0)
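
    /* e.g. (illustrative, not part of the original header):
        SHUFSEL (0,0,0,0) == 0x00, so SHUFPS (x, x, 0x00) broadcasts lane 0
        SHUFSEL (2,3,0,1) == 0x4E, so SHUFPS (x, x, 0x4E) swaps the 64-bit halves */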

#endif



#if 0
/* this was not building correctly
#ifdef __ALTIVEC__
*/
    local inline vector float AddAcross4(vector float a, vector float b, vector float c, vector float d) {
        vector float tempA, tempB, tempC, tempD;

        /* First half of a 4x4 matrix transpose */
        tempA = vec_mergeh( a, c );     /* {a0, c0, a1, c1} */
        tempC = vec_mergel( a, c );     /* {a2, c2, a3, c3} */
        tempB = vec_mergeh( b, d );     /* {b0, d0, b1, d1} */
        tempD = vec_mergel( b, d );     /* {b2, d2, b3, d3} */

        /* Add intermediate values */
        b = vec_add( tempA, tempC );    /* {a0 + a2, c0 + c2, a1 + a3, c1 + c3} */
        d = vec_add( tempB, tempD );    /* {b0 + b2, d0 + d2, b1 + b3, d1 + d3} */

        /* Do half of the second half of the transpose */
        a = vec_mergeh( b, d );         /* { a0 + a2, b0 + b2, c0 + c2, d0 + d2 } */
        c = vec_mergel( b, d );         /* { a1 + a3, b1 + b3, c1 + c3, d1 + d3 } */

        /* Find the result */
        return vec_add( a, c );         /* { a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3 } */
    }
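
    /* Illustrative use (an assumption, not from the original source): lane i of the
       result is the horizontal sum of input i, so four dot products against a common
       vector x can be formed at once:
            vector float zero = (vector float) vec_splat_u32 (0);
            vector float dots = AddAcross4 (vec_madd (a, x, zero),
                                            vec_madd (b, x, zero),
                                            vec_madd (c, x, zero),
                                            vec_madd (d, x, zero));
     */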

    local inline vector unsigned char LoadUnaligned(int index, unsigned char *target) {
        vector unsigned char MSQ, LSQ;
        vector unsigned char mask;

        /* most significant quadword */
        MSQ = vec_ld(index, target);

        /* least significant quadword */
        LSQ = vec_ld(index + 16, target);

        /* create the permute mask */
        mask = vec_lvsl(0, target + index);

        /* return the permuted data */
        return vec_perm(MSQ, LSQ, mask);
    }
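
    /* Note (added commentary): vec_ld rounds its address down to a 16-byte boundary,
       so the two loads fetch the quadwords that enclose target + index and vec_perm
       splices out the 16 misaligned bytes; the reads may touch up to 31 bytes
       starting at the rounded-down address. */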

    local inline void StoreUnaligned(vector unsigned char src, int index, unsigned char *target) {
        vector unsigned char MSQ, LSQ;
        vector unsigned char mask, align, zero, neg1;

        MSQ = vec_ld(index, target);            /* most significant quadword */
        LSQ = vec_ld(index + 16, target);       /* least significant quadword */

        align = vec_lvsr(0, target + index);    /* create alignment vector */
        zero = vec_splat_u8( 0 );               /* Create vector full of zeros */
        neg1 = vec_splat_u8( -1 );              /* Create vector full of -1 */

        mask = vec_perm(zero, neg1, align);     /* Create select mask */

        src = vec_perm(src, src, align);        /* Right rotate stored data */

        MSQ = vec_sel(MSQ, src, mask);          /* Insert data into MSQ part */
        LSQ = vec_sel(src, LSQ, mask);          /* Insert data into LSQ part */

        vec_st(MSQ, index, target);             /* Store the MSQ part */
        vec_st(LSQ, index + 16, target);        /* Store the LSQ part */
    }
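
    /* Note (added commentary): this is a read-modify-write of the two quadwords
       enclosing target + index, so it is unsafe if another thread may concurrently
       touch the surrounding bytes between these loads and stores. */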
#endif


#define HOOPS_PROCESSOR_SPECIFIC_FUNCTIONALITY_DEFINED
#endif