/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Transposes an 8-row strip of bytes, one 8x8 tile per loop iteration.
//
// Each iteration reads 8 bytes from each of 8 consecutive source rows (rows
// are src_stride bytes apart) and writes 8 destination rows of 8 bytes each
// (rows are dst_stride bytes apart), such that source column i of the tile
// becomes destination row i. After a tile, src has advanced by 8 bytes and
// dst by 8 * dst_stride bytes.
//
// width is decremented by 8 per tile and the loop repeats while the result
// is greater than zero. The test sits at the bottom of the loop, so one tile
// is always processed, and a width that is not a multiple of 8 still causes
// a full 8-column tile to be read and written for the remainder.
//
// The function is naked: the register save/restore and the return are
// written by hand. edi, esi and ebp are pushed on entry (hence the "+ 12" in
// the argument offsets) and popped before the plain `ret`.
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
                                          int src_stride,
                                          uint8* dst,
                                          int dst_stride,
                                          int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]  // src
    mov       edi, [esp + 12 + 8]  // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round: interleave the bytes of each pair of adjacent rows.
    // After each punpcklbw the 16-byte result holds columns 0-7 of two rows;
    // "palignr x, x, 8" rotates a copy by 8 bytes so that columns 4-7 sit in
    // the low half, where the next punpckl* can reach them.
    align      4
  convertloop:
    movq      xmm0, qword ptr [eax]
    // ebp remembers where the next tile starts (same row, 8 bytes right),
    // because eax is about to walk down the 8 rows.
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    // All 8 rows are loaded; point eax at the next tile.
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8

    // Second round: interleave 16-bit words, combining row pairs into groups
    // of four rows. The palignr copies again expose the upper halves.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8

    // Third round: interleave 32-bit dwords, combining rows 0-3 with rows
    // 4-7 so that each 8-byte half is one complete source column.
    // Write to the destination pointer, two rows per step of edx.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    // The flags set by this sub are consumed by the jg below; only movq and
    // lea (neither of which writes flags) may sit in between.
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

// Transposes an 8-row strip of byte pairs and splits the two interleaved
// channels into separate destinations (presumably the U and V channels of an
// interleaved UV plane, going by the function name - confirm with callers).
//
// Each iteration reads 16 bytes from each of 8 consecutive source rows (rows
// are src_stride bytes apart). Every source byte column becomes one 8-byte
// output row: even byte columns are written to dst_a (rows dst_stride_a
// apart) and odd byte columns to dst_b (rows dst_stride_b apart), 8 rows to
// each destination per iteration. src then advances by 16 bytes.
//
// w is decremented by 8 per iteration (one unit per 2-byte source pair) and
// the loop repeats while the result is greater than zero. The test sits at
// the bottom of the loop, so one iteration is always executed.
//
// The function is naked: the register save/restore, the stack frame and the
// return are written by hand. ebx, esi, edi and ebp are pushed on entry
// (hence the "+ 16" in the argument offsets). xmm0-xmm7 are all in use, so a
// 16-byte scratch slot is reserved on the stack to spill one register.
__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
                                           int src_stride,
                                           uint8* dst_a,
                                           int dst_stride_a,
                                           uint8* dst_b,
                                           int dst_stride_b,
                                           int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]  // src
    mov       edi, [esp + 16 + 8]  // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Build the scratch frame: keep the incoming esp in ecx, reserve 16 bytes
    // of spill space plus 4 bytes for the saved esp, and round esp down to a
    // 16-byte boundary. [esp] is the spill slot, [esp + 16] the saved esp.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    // esp has moved, so the last argument is fetched via the saved copy.
    mov       ecx, [ecx + 16 + 28]  // w

    align      4
    // Read in the data from the source pointer.
    // First round: interleave the bytes of each pair of adjacent rows. The
    // low-half result stays in the even register and the high-half result,
    // built in a temporary, replaces the odd register.
  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    // xmm7 now holds row data, so no register is free as a temporary; park
    // xmm5 in the stack slot and borrow it.
    movdqu    [esp], xmm5  // backup xmm5
    // eax has advanced by 8 * src_stride. Negating edi lets a single lea
    // rewind those 8 rows and step 16 bytes right to the next tile; edi is
    // then negated back to its original value.
    neg       edi
    movdqa    xmm5, xmm6  // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi

    // Second round: interleave 16-bit words, combining row pairs into groups
    // of four rows.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    // Swap which register lives in the stack slot: bring xmm5 back and park
    // xmm6 so that xmm6 can serve as the temporary.
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5  // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6

    // Third round: interleave 32-bit dwords, combining rows 0-3 with rows
    // 4-7. Each register then holds two adjacent source byte columns: the
    // low 8 bytes (an even column) and the high 8 bytes (an odd column).
    // Write to the destination pointers: movlpd stores the low half to dst_a
    // and movhpd stores the high half to dst_b.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    // The flags set by this sub are consumed by the jg below; only stores
    // and lea (none of which write flags) may sit in between.
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    // Drop the scratch frame by reloading the esp saved on entry.
    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif