/*
 * Copyright (c) 2005 by Tech Soft 3D, LLC.
 * The information contained herein is confidential and proprietary to
 * Tech Soft 3D, LLC., and considered a trade secret as defined under
 * civil and criminal statutes.  Tech Soft 3D shall pursue its civil
 * and criminal remedies in the event of unauthorized use or misappropriation
 * of its trade secrets.  Use of this information by anyone other than
 * authorized employees of Tech Soft 3D, LLC. is granted only under a
 * written non-disclosure agreement, expressly prescribing the scope and
 * manner of such use.
 *
 * $Id: hpspsf_8h-source.html,v 1.29 2008-03-10 07:09:28 stage Exp $
 */

#ifndef HOOPS_PROCESSOR_SPECIFIC_FUNCTIONALITY_DEFINED

#ifdef USE_M_IX86_ASM
/*
    Uppercase for macros, lowercase for straight asm
    Implicit indirection on general registers (ie, EAX means [eax]) used with these macros
*/

#   ifdef __INTEL_COMPILER
#       define EAX              [eax]
#       define EBX              [ebx]
#       define ECX              [ecx]
#       define EDX              [edx]
#       define EDI              [edi]
        /* XMM* should be ok as-is */


#       define FXSAVE(d)        __asm fxsave d
#       define FXRSTOR(d)       __asm fxrstor d
#       define LDMXCSR(d)       __asm ldmxcsr d
#       define STMXCSR(d)       __asm stmxcsr d

#       define PREFETCHNTA(d)   __asm prefetchnta d
#       define PREFETCHT0(d)    __asm prefetcht0 d
#       define PREFETCHT1(d)    __asm prefetcht1 d
#       define PREFETCHT2(d)    __asm prefetcht2 d

#       define ADDPS(d,s)       __asm addps d, s
#       define SUBPS(d,s)       __asm subps d, s
#       define MULPS(d,s)       __asm mulps d, s
#       define DIVPS(d,s)       __asm divps d, s
#       define MINPS(d,s)       __asm minps d, s
#       define MAXPS(d,s)       __asm maxps d, s
#       define SQRTPS(d,s)      __asm sqrtps d, s
#       define RCPPS(d,s)       __asm rcpps d, s
#       define RSQRTPS(d,s)     __asm rsqrtps d, s

#       define ANDPS(d,s)       __asm andps d, s
#       define ANDNPS(d,s)      __asm andnps d, s
#       define ORPS(d,s)        __asm orps d, s
#       define XORPS(d,s)       __asm xorps d, s

#       define UNPCKHPS(d,s)    __asm unpckhps d, s
#       define UNPCKLPS(d,s)    __asm unpcklps d, s

#       define CMPPS(d,s,c)     __asm cmpps d, s, c
#       define SHUFPS(d,s,c)    __asm shufps d, s, c

#       define MOVHLPS(d,s)     __asm movhlps d, s
#       define MOVLHPS(d,s)     __asm movlhps d, s
#       define MOVLPS(d,s)      __asm movlps d, s
#       define MOVHPS(d,s)      __asm movhps d, s
#       define MOVAPS(d,s)      __asm movaps d, s
#       define MOVUPS(d,s)      __asm movups d, s

#       if 0
#       define CVTDQ2PS(d,s)    __asm cvtdq2ps d, s

#       define CVTPI2PS(d,s)    __asm cvtpi2ps d, s
#       define CVTPS2PI(d,s)    __asm cvtps2pi d, s

#       define ADDSS(d,s)       __asm addss d, s
#       define SUBSS(d,s)       __asm subss d, s
#       define MULSS(d,s)       __asm mulss d, s
#       define DIVSS(d,s)       __asm divss d, s
#       define MINSS(d,s)       __asm minss d, s
#       define MAXSS(d,s)       __asm maxss d, s
#       define SQRTSS(d,s)      __asm sqrtss d, s
#       define RCPSS(d,s)       __asm rcpss d, s
#       define RSQRTSS(d,s)     __asm rsqrtss d, s

#       define CMPSS(d,s,c)     __asm cmpss d, s, c

#       define MOVSS(d,s)       __asm movss d, s

#       define CVTSI2SS(d,s)    __asm cvtsi2ss d, s
#       define CVTSS2SI(d,s)    __asm cvtss2si d, s

#       define CVTDQ2PS(d,s)    __asm cvtdq2ps d, s
#       define CVTPS2DQ(d,s)    __asm cvtps2dq d, s
#       define PAND(d,s)        __asm pand d, s
#       define PANDN(d,s)       __asm pandn d, s
#       define POR(d,s)         __asm por d, s
#       define PXOR(d,s)        __asm pxor d, s

#       define MOVSLDUP(d,s)    __asm movsldup d, s
#       define MOVSHDUP(d,s)    __asm movshdup d, s
#       define ADDSUBPS(d,s)    __asm addsubps d, s
#       define HADDPS(d,s)      __asm haddps d, s
#       define HSUBPS(d,s)      __asm hsubps d, s
#       endif
#   else
        /* high nibble Mod (and mask for certain operations), low nibble Reg or R/M */
#       define EAX      0x00
#       define EBX      0x03    /* yes, this is out of order... */
#       define ECX      0x01
#       define EDX      0x02
#       define EDI      0x07
#       define XMM0     0xF0
#       define XMM1     0xF1
#       define XMM2     0xF2
#       define XMM3     0xF3
#       define XMM4     0xF4
#       define XMM5     0xF5
#       define XMM6     0xF6
#       define XMM7     0xF7

#       define EMIT(x)  __asm _emit x

/*******************************************************************************/
        /* opcode prefix values, all SSE uses 0F, some have an additional prefix first */
#       define Op0F     EMIT(0x0F)      /* all SSE */
#       define Op66     EMIT(0x66)      /* some SSE2 */
#       define OpF2     EMIT(0xF2)      /* some SSE3 */
#       define OpF3     EMIT(0xF3)      /* all SS versions of PS */

        /* utility macros for addressing:
           Mod is 3 for XMM <- XMM, 0 if a straight memory reference (1 & 2 for displacements, currently unused)
           any memory reference is always in R/M
           otherwise destination goes in Reg
         */
#       define ModRegRM(mod,reg,rm)     EMIT (((mod & 3) << 6) | ((reg & 7) << 3) | (rm & 7))
#       define DS_Mod(d,s)              (((d & s) >> 4) & 3)
#       define DS_Reg(d,s)              ((d & (d >> 4)) | (s & ~(d >> 4)))
#       define DS_RM(d,s)               ((s & (d >> 4)) | (d & ~(d >> 4)))
#       define DS_ModRM(d,s)            ModRegRM (DS_Mod(d,s), DS_Reg(d,s), DS_RM(d,s))

        /* some opcodes like MOVxPS bump up 1 if source is XMM and destination is memory
           (could also bump if destination is XMM, but need to reverse args in addition)
         */
#       define BumpX2M(op,bmp,d,s)      (op | ((s >> 4) & (~d >> 4) & bmp))

#       define SSE_OP(op,d,s)           Op0F EMIT (op) DS_ModRM (d, s)
#       define SSE_PS(op,bmp,d,s)       SSE_OP (BumpX2M (op,bmp,d,s), d, s)
#       define SSE_SS(op,bmp,d,s)       OpF3 SSE_PS (op,bmp,d,s)

/*******************************************************************************/
        /* SSE */

        /* control */
#       define FXSAVE(d)        SSE_OP (0xAE,d,0)       /* Float state -> [d] */
#       define FXRSTOR(d)       SSE_OP (0xAE,d,1)       /* Float state <- [d] */
#       define LDMXCSR(d)       SSE_OP (0xAE,d,2)       /* MXCSR <- d */
#       define STMXCSR(d)       SSE_OP (0xAE,d,3)       /* MXCSR -> d */

#       define PREFETCHNTA(d)   SSE_OP (0x18,d,0)       /* non-temporal */
#       define PREFETCHT0(d)    SSE_OP (0x18,d,1)       /* all cache levels */
#       define PREFETCHT1(d)    SSE_OP (0x18,d,2)       /* first-level cache */
#       define PREFETCHT2(d)    SSE_OP (0x18,d,3)       /* second-level cache */

        /* normal PS functions act on 4 single floats in parallel */
#       define ADDPS(d,s)       SSE_PS (0x58,0,d,s)     /* 4 x d <- d + s */
#       define SUBPS(d,s)       SSE_PS (0x5C,0,d,s)     /* 4 x d <- d - s */
#       define MULPS(d,s)       SSE_PS (0x59,0,d,s)     /* 4 x d <- d * s */
#       define DIVPS(d,s)       SSE_PS (0x5E,0,d,s)     /* 4 x d <- d / s */
#       define MINPS(d,s)       SSE_PS (0x5D,0,d,s)     /* 4 x d <- min (d,s) */
#       define MAXPS(d,s)       SSE_PS (0x5F,0,d,s)     /* 4 x d <- max (d,s) */
#       define SQRTPS(d,s)      SSE_PS (0x51,0,d,s)     /* 4 x d <- sqrt(s) */
#       define RCPPS(d,s)       SSE_PS (0x53,0,d,s)     /* 4 x d <~ 1/s */
#       define RSQRTPS(d,s)     SSE_PS (0x52,0,d,s)     /* 4 x d <~ 1/sqrt(s) */

#       define ANDPS(d,s)       SSE_PS (0x54,0,d,s)     /* 4 x d <- d & s */
#       define ANDNPS(d,s)      SSE_PS (0x55,0,d,s)     /* 4 x d <- ~d & s */
#       define ORPS(d,s)        SSE_PS (0x56,0,d,s)     /* 4 x d <- d | s */
#       define XORPS(d,s)       SSE_PS (0x57,0,d,s)     /* 4 x d <- d ^ s */

#       define UNPCKHPS(d,s)    SSE_PS (0x15,0,d,s)     /* d <- (d2,s2,d3,s3) */
#       define UNPCKLPS(d,s)    SSE_PS (0x14,0,d,s)     /* d <- (d0,s0,d1,s1) */

#       define CMPPS(d,s,c)     SSE_PS (0xC2,0,d,s) EMIT (c)    /* 4 x d <- d c s */
#       define SHUFPS(d,s,c)    SSE_PS (0xC6,0,d,s) EMIT (c)    /* d <- (d[op01], d[op23], s[op45], s[op67]) */

#       define MOVHLPS(d,s)     SSE_PS (0x12,0,d,s)     /* d01 <- s23 */
#       define MOVLHPS(d,s)     SSE_PS (0x16,0,d,s)     /* d23 <- s01 */
#       define MOVLPS(d,s)      SSE_PS (0x12,1,d,s)     /* d01 <- sm  OR  dm <- s01 */
#       define MOVHPS(d,s)      SSE_PS (0x16,1,d,s)     /* d23 <- sm  OR  dm <- s23 */
#       define MOVAPS(d,s)      SSE_PS (0x28,1,d,s)     /* 4 x d <- s (aligned) */
#       define MOVUPS(d,s)      SSE_PS (0x10,1,d,s)     /* 4 x d <- s */

#       if 0    /* currently only using SSE 4-float PS operations */
#       define CVTDQ2PS(d,s)    SSE_PS (0x5B,0,d,s)     /* 4 x Float <- Int */

        /* limited PS functions acting on 2 single floats */
#       define CVTPI2PS(d,s)    SSE_PS (0x2A,0,d,s)     /* 2 x Float <- Int */
#       define CVTPS2PI(d,s)    SSE_PS (0x2D,0,d,s)     /* 2 x Int <- Float */

        /* normal SS functions act on 1 single float */
#       define ADDSS(d,s)       SSE_SS (0x58,0,d,s)     /* 1 x d <- d + s */
#       define SUBSS(d,s)       SSE_SS (0x5C,0,d,s)     /* 1 x d <- d - s */
#       define MULSS(d,s)       SSE_SS (0x59,0,d,s)     /* 1 x d <- d * s */
#       define DIVSS(d,s)       SSE_SS (0x5E,0,d,s)     /* 1 x d <- d / s */
#       define MINSS(d,s)       SSE_SS (0x5D,0,d,s)     /* 1 x d <- min (d,s) */
#       define MAXSS(d,s)       SSE_SS (0x5F,0,d,s)     /* 1 x d <- max (d,s) */
#       define SQRTSS(d,s)      SSE_SS (0x51,0,d,s)     /* 1 x d <- sqrt(s) */
#       define RCPSS(d,s)       SSE_SS (0x53,0,d,s)     /* 1 x d <~ 1/s */
#       define RSQRTSS(d,s)     SSE_SS (0x52,0,d,s)     /* 1 x d <~ 1/sqrt(s) */

#       define CMPSS(d,s,c)     SSE_SS (0xC2,0,d,s) EMIT (c)    /* 1 x d <- d c s */

#       define MOVSS(d,s)       SSE_SS (0x10,1,d,s)     /* d0 <- s0  OR  d <- (sm0,0,0,0) */

#       define CVTSI2SS(d,s)    SSE_SS (0x2A,0,d,s)     /* 1 x Float <- Int */
#       define CVTSS2SI(d,s)    SSE_SS (0x2D,0,d,s)     /* 1 x Int <- Float */

        /* SSE2 */
#       define CVTDQ2PS(d,s)    SSE_PS (0x5B,0,d,s)     /* 4 x Float <- Int */
#       define CVTPS2DQ(d,s)    Op66 SSE_PS (0x5B,0,d,s)        /* 4 x Int <- Float */
#       define PAND(d,s)        Op66 SSE_PS (0xDB,0,d,s)        /* 4 x d <- d & s */
#       define PANDN(d,s)       Op66 SSE_PS (0xDF,0,d,s)        /* 4 x d <- ~d & s */
#       define POR(d,s)         Op66 SSE_PS (0xEB,0,d,s)        /* 4 x d <- d | s */
#       define PXOR(d,s)        Op66 SSE_PS (0xEF,0,d,s)        /* 4 x d <- d ^ s */
#       define PSHUFD(d,s,c)    Op66 SSE_PS (0x70,0,d,s) EMIT (c)       /* d <- (s[op01], s[op23], s[op45], s[op67]) */

        /* SSE3 */
#       define MOVSLDUP(d,s)    SSE_SS (0x12,0,d,s)     /* d <- (s0,s0,s2,s2) */
#       define MOVSHDUP(d,s)    SSE_SS (0x16,0,d,s)     /* d <- (s1,s1,s3,s3) */
#       define ADDSUBPS(d,s)    OpF2 SSE_PS (0xD0,0,d,s)        /* d <- (d0-s0, d1+s1, d2-s2, d3+s3) */
#       define HADDPS(d,s)      OpF2 SSE_PS (0x7C,0,d,s)        /* d <- (d0+d1, d2+d3, s0+s1, s2+s3) */
#       define HSUBPS(d,s)      OpF2 SSE_PS (0x7D,0,d,s)        /* d <- (d0-d1, d2-d3, s0-s1, s2-s3) */
#       endif
#   endif

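/* Illustrative usage (a sketch, not part of the original macro set; the function
   name and arguments are hypothetical): with either branch above, a 4-wide add of
   two unaligned float quads could be written as

       static void add4 (float * dst, float const * src) {
           __asm mov eax, dst
           __asm mov ebx, src
           MOVUPS (XMM0, EAX)      load 4 floats from [eax]
           MOVUPS (XMM1, EBX)      load 4 floats from [ebx]
           ADDPS  (XMM0, XMM1)     xmm0 += xmm1, element-wise
           MOVUPS (EAX, XMM0)      store the 4 results back to [eax]
       }

   In the emit-byte branch, ADDPS (XMM0, XMM1) produces the bytes 0F 58 C1:
   DS_ModRM resolves to mod 3 (register-register), reg 0 (xmm0), r/m 1 (xmm1),
   i.e. the ModRM byte 0xC1.  MOVUPS (EAX, XMM0) has a memory destination, so
   BumpX2M raises the opcode from 0x10 to 0x11 and the bytes become 0F 11 00
   (mod 0, reg 0 = xmm0, r/m 0 = [eax]). */
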
/* Pseudo-op versions of CMPxS */
#   define CMPEQPS(d,s)         CMPPS (d, s, 0)         /* d == s */
#   define CMPLTPS(d,s)         CMPPS (d, s, 1)         /* d < s */
#   define CMPLEPS(d,s)         CMPPS (d, s, 2)         /* d <= s */
#   define CMPUNORDPS(d,s)      CMPPS (d, s, 3)         /* either d or s is NaN */
#   define CMPNEQPS(d,s)        CMPPS (d, s, 4)         /* d != s */
#   define CMPNLTPS(d,s)        CMPPS (d, s, 5)         /* d >= s */
#   define CMPNLEPS(d,s)        CMPPS (d, s, 6)         /* d > s */
#   define CMPORDPS(d,s)        CMPPS (d, s, 7)         /* neither d nor s is NaN */

#   define CMPEQSS(d,s)         CMPSS (d, s, 0)         /* d == s */
#   define CMPLTSS(d,s)         CMPSS (d, s, 1)         /* d < s */
#   define CMPLESS(d,s)         CMPSS (d, s, 2)         /* d <= s */
#   define CMPUNORDSS(d,s)      CMPSS (d, s, 3)         /* either d or s is NaN */
#   define CMPNEQSS(d,s)        CMPSS (d, s, 4)         /* d != s */
#   define CMPNLTSS(d,s)        CMPSS (d, s, 5)         /* d >= s */
#   define CMPNLESS(d,s)        CMPSS (d, s, 6)         /* d > s */
#   define CMPORDSS(d,s)        CMPSS (d, s, 7)         /* neither d nor s is NaN */

    /* defines the selector byte 'c' for SHUFPS(d,s,c) if you don't want to just specify a hex value */
    /* d0 goes in low part of destination, s1 in high part (an illustrative use appears at the end of this file) */
#   define SHUFSEL(d0,d1,s0,s1) ((s1<<6) | (s0<<4) | (d1<<2) | d0)

#endif



#if 0
/* this was not building correctly
#ifdef __ALTIVEC__
*/
local inline vector float AddAcross4(vector float a, vector float b, vector float c, vector float d) {
    vector float tempA, tempB, tempC, tempD;

    /* First half of a 4x4 matrix transpose */
    tempA = vec_mergeh( a, c );         /* 1 {a0, c0, a1, c1} */
    tempC = vec_mergel( a, c );         /* 2 {a2, c2, a3, c3} */
    tempB = vec_mergeh( b, d );         /* 3 {b0, d0, b1, d1} */
    tempD = vec_mergel( b, d );         /* 4 {b2, d2, b3, d3} */

    /* Add intermediate values */
    b = vec_add( tempA, tempC );        /* 4 {a0 + a2, c0 + c2, a1 + a3, c1 + c3} */
    d = vec_add( tempB, tempD );        /* 6 {b0 + b2, d0 + d2, b1 + b3, d1 + d3} */

    /* Do half of the second half of the transpose */
    a = vec_mergeh( b, d );             /* 7 { a0 + a2, b0 + b2, c0 + c2, d0 + d2 } */
    c = vec_mergel( b, d );             /* 8 { a1 + a3, b1 + b3, c1 + c3, d1 + d3 } */

    /* Find the result */
    return vec_add( a, c );             /* 10 */
}

local inline vector unsigned char LoadUnaligned(int index, unsigned char *target) {
    vector unsigned char MSQ, LSQ;
    vector unsigned char mask;

    /* most significant quadword */
    MSQ = vec_ld(index, target);

    /* least significant quadword */
    LSQ = vec_ld(index + 16, target);

    /* create the permute mask */
    mask = vec_lvsl(0, target + index);

    /* return the permuted data */
    return vec_perm(MSQ, LSQ, mask);
}

local inline void StoreUnaligned(vector unsigned char src, int index, unsigned char *target) {
    vector unsigned char MSQ, LSQ;
    vector unsigned char mask, align, zero, neg1;

    MSQ = vec_ld(index, target);                /* most significant quadword */
    LSQ = vec_ld(index + 16, target);           /* least significant quadword */

    align = vec_lvsr(0, target + index);        /* create alignment vector */
    zero = vec_splat_u8( 0 );                   /* Create vector full of zeros */
    neg1 = vec_splat_u8( -1 );                  /* Create vector full of -1 */

    mask = vec_perm(zero, neg1, align);         /* Create select mask */

    src = vec_perm(src, src, align);            /* Right rotate stored data */

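    /* The mask is zero for the first ((target + index) & 15) bytes and all-ones after,
       so the selects below splice the low bytes of the rotated data into the tail of
       MSQ and the high bytes into the head of LSQ, leaving the neighbouring bytes
       already in memory untouched */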
    MSQ = vec_sel(MSQ, src, mask);              /* Insert data into MSQ part */
    LSQ = vec_sel(src, LSQ, mask);              /* Insert data into LSQ part */

    vec_st(MSQ, index, target);                 /* Store the MSQ part */
    vec_st(LSQ, index + 16, target);            /* Store the LSQ part */
}
#endif


#define HOOPS_PROCESSOR_SPECIFIC_FUNCTIONALITY_DEFINED
#endif
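
/* Example of the SHUFSEL selector referenced above (illustrative only): SHUFPS fills
   the low two result slots from the destination and the high two from the source, so

       SHUFPS (XMM0, XMM0, SHUFSEL (3, 2, 1, 0))    reverses the four floats in xmm0
       SHUFPS (XMM0, XMM0, SHUFSEL (0, 0, 0, 0))    broadcasts element 0 to all four slots

   SHUFSEL (3, 2, 1, 0) evaluates to (0 << 6) | (1 << 4) | (2 << 2) | 3 = 0x1B. */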