/*
 * ConvertColorsGamma
 *
 * Converts packed single-precision colors to packed 8-bit colors, applying a
 * cheap gamma-like curve on the way.
 *
 *   pTo   : destination, 16-byte aligned; receives 4 bytes per color.
 *   pRGB  : source, 16-byte aligned; 4 floats (16 bytes) per color.
 *           The original comments label the lanes R,G,B,A.
 *   count : number of colors; expected to be a multiple of 4.  Each loop
 *           iteration consumes 4 colors (64 source bytes) and produces
 *           16 destination bytes.
 *
 * Per lane the pipeline is:
 *   clamp against c_pq0 / c_pq1  ->  y = 2x - x*x  ->  y * c_pq255
 *   ->  convert to int32  ->  saturating pack down to unsigned bytes.
 * All four lanes (including the one labelled A) get the same curve.
 *
 * c_pq0, c_pq1 and c_pq255 are defined elsewhere in the project; from their
 * names and use they are presumably 16-byte-aligned 4-float vectors holding
 * 0, 1 and 255 -- TODO confirm against their definition.
 *
 * If fewer than 4 colors are requested (including zero or a negative count)
 * the function does nothing.
 */
void ConvertColorsGamma(void * pTo,const void * pRGB,int count)
{
	ASSERT( (count&3) == 0 );
	ASSERT( ((intptr_t)pTo&0xF) == 0 );
	ASSERT( ((intptr_t)pRGB&0xF) == 0 );

	// The asm loop below is bottom-tested (dec ecx / jnz), so it must never be
	// entered with a zero iteration count: ecx would wrap to 0xFFFFFFFF and the
	// loop would run off the end of both buffers.  count>>2 is zero for
	// count in 0..3, and a negative count would turn into a huge unsigned
	// iteration count through the logical shift, so reject all of those here.
	if ( count < 4 )
		return;

	__asm
	{
		movaps xmm0,c_pq0
		movaps xmm1,c_pq1
		movaps xmm2,c_pq255

		mov edi,pTo
		mov esi,pRGB
		mov ecx,count
		shr ecx,2			// iterations = count / 4 (4 colors per pass)

	lp:
		// load 4 colors, one per register
		movaps xmm4,[esi]		// R,G,B,A
		movaps xmm5,[esi+16]	// R,G,B,A
		movaps xmm6,[esi+32]	// R,G,B,A
		movaps xmm7,[esi+48]	// R,G,B,A

		// clamp to range 0-1
		maxps xmm4, xmm0
		maxps xmm5, xmm0
		maxps xmm6, xmm0
		maxps xmm7, xmm0

		minps xmm4, xmm1
		minps xmm5, xmm1
		minps xmm6, xmm1
		minps xmm7, xmm1

		// rough quadratic approximation of sqrt : 2*x - x*x
		// (the exact alternative would be "sqrtps xmmN,xmmN" on each register)
		// xmm3 is scratch for the x*x term.

		movaps xmm3,xmm4
		mulps xmm3,xmm3			// x*x
		addps xmm4,xmm4			// 2x
		subps xmm4,xmm3			// 2x - x*x

		movaps xmm3,xmm5
		mulps xmm3,xmm3			// x*x
		addps xmm5,xmm5			// 2x
		subps xmm5,xmm3			// 2x - x*x

		movaps xmm3,xmm6
		mulps xmm3,xmm3			// x*x
		addps xmm6,xmm6			// 2x
		subps xmm6,xmm3			// 2x - x*x

		movaps xmm3,xmm7
		mulps xmm3,xmm3			// x*x
		addps xmm7,xmm7			// 2x
		subps xmm7,xmm3			// 2x - x*x

		// scale by 255 :
		mulps xmm4,xmm2
		mulps xmm5,xmm2
		mulps xmm6,xmm2
		mulps xmm7,xmm2

		// float -> int32 in each lane
		cvtps2dq xmm4,xmm4		// 000R000G000B000A
		cvtps2dq xmm5,xmm5		// 000R000G000B000A
		cvtps2dq xmm6,xmm6		// 000R000G000B000A
		cvtps2dq xmm7,xmm7		// 000R000G000B000A

		// narrow int32 -> int16 -> uint8 with saturation; the pack order keeps
		// the four colors in their original sequence in the output register.
		PACKSSDW xmm4,xmm5		// 4 <- {4,5}
		PACKSSDW xmm6,xmm7		// 6 <- {6,7}
		PACKUSWB xmm4,xmm6		// 4 <- {4,6}

		movaps [edi],xmm4		// store 4 colors x 4 bytes

		// and loop :
		add esi,64
		add edi,16
		dec ecx
		jnz lp
	}
}