[asmjit] ; MultiplyMatrixVectorArrayF32
[asmjit] ;
[asmjit] ; Transforms an array of 4-float vectors by a 4x4 float matrix:
[asmjit] ;   dst[i] = src[i].x * matrix[0] + src[i].y * matrix[1] + src[i].z * matrix[2] + src[i].w * matrix[3]
[asmjit] ; where matrix[k] is the 16-byte row loaded from [mat + 16*k].
[asmjit] ;
[asmjit] ; In (per the register annotations below): rdi = dst, rsi = mat, rdx = src, rcx = count (number of vectors).
[asmjit] ; NOTE(review): this matches the SysV AMD64 order for (dst, mat, src, count) - confirm against the generator.
[asmjit] ; Alignment: the one-vector paths use vmovaps on both [src] and [dst], so both must be 16-byte aligned.
[asmjit] ; The 6-vector loop stores with 32-byte vmovaps after dst has been stepped to a 32-byte boundary, and loads
[asmjit] ; src with vmovups.
[asmjit] ; Clobbers: rax, rcx, rdx, rsi, rdi, ymm0-ymm15, flags. vzeroupper is executed before returning.
[asmjit] ; vfmaddps is used in its four-operand (FMA4) form.
[asmjit] ; Immediates in the instruction column are printed in hexadecimal (10 = 16, 60 = 96, 55 = 85, ...).
[asmjit] ;
[asmjit] ; Load the four matrix rows, duplicated into both 128-bit lanes so one ymm operation handles two vectors.
[asmjit] vbroadcastf128 ymm0, [rsi] ; C4E27D1A06 | vbroadcastf128 matrix[0], [mat] .r..x... .... ........
[asmjit] vbroadcastf128 ymm1, [rsi+0x10] ; C4E27D1A4E10 | vbroadcastf128 matrix[1], [mat+16] .r...x.. .... ........
[asmjit] vbroadcastf128 ymm2, [rsi+0x20] ; C4E27D1A5620 | vbroadcastf128 matrix[2], [mat+32] .r....x. .... ........
[asmjit] vbroadcastf128 ymm3, [rsi+0x30] ; C4E27D1A5E30 | vbroadcastf128 matrix[3], [mat+48] .R.....x .... ........
[asmjit] ; mat is dead from here on; src moves into rsi, which frees rdx for the `mul` at L3.
[asmjit] mov rsi, rdx ; 488BF2 | [Move] src
[asmjit] ; count == 0 -> nothing to do; count < 6 -> only the one-vector tail loop (L5) is needed.
[asmjit] test rcx, rcx ; 4885C9 | test count, count . .r.... .... ........
[asmjit] jz L2 ; 0F84........ | jz L2 . ...... .... ........
[asmjit] cmp rcx, 6 ; 4883F906 | cmp count, 6 . .r.... .... ........
[asmjit] ; count is an element count, so the comparison is unsigned (jb, not jl).
[asmjit] jb L5 ; 0F82........ | jb L5 . ...... .... ........
[asmjit] ; alignHelper = dst & 16. With dst 16-byte aligned, zero means dst is already 32-byte aligned.
[asmjit] mov rax, 10 ; 48C7C010000000 | mov alignHelper, 16 . ......w.... ........
[asmjit] and rax, rdi ; 4823C7 | and alignHelper, dst r ......x.... ........
[asmjit] jz L3 ; 0F84........ | jz L3 . ........... ........
[asmjit] ; Alignment step: transform exactly one vector so that dst reaches a 32-byte boundary.
[asmjit] ; Both pointers are advanced by 16 first, then the vector is addressed at offset -16.
[asmjit] add rdi, rax ; 4803F8 | add dst, alignHelper x ......r.... ........
[asmjit] add rsi, rax ; 4803F0 | add src, alignHelper . x.....r.... ........
[asmjit] neg rax ; 48F7D8 | neg alignHelper . ......x.... ........
[asmjit] vmovaps xmm7, [rsi+rax] ; C5F8283C06 | vmovaps v1-2.w, [src+alignHelper] . r.....r...x ........
[asmjit] vshufps xmm4, xmm7, xmm7, 0 ; C5C0C6E700 | vshufps v1-2.x, v1-2.w, v1-2.w, 0 . .......x..r ........
[asmjit] vmulps xmm4, xmm4, xmm0 ; C5D859E0 | vmulps v1-2.x, v1-2.x, matrix[0] . ..r....x... ........
[asmjit] vshufps xmm5, xmm7, xmm7, 55 ; C5C0C6EF55 | vshufps v1-2.y, v1-2.w, v1-2.w, 85 . ........x.r ........
[asmjit] vfmaddps xmm4, xmm5, xmm1, xmm4 ; C4E35168E140 | vfmaddps v1-2.x, v1-2.y, matrix[1], v1-2.x . ...r...xr.. ........
[asmjit] vshufps xmm6, xmm7, xmm7, AA ; C5C0C6F7AA | vshufps v1-2.z, v1-2.w, v1-2.w, 170 . .........xr ........
[asmjit] vfmaddps xmm4, xmm6, xmm2, xmm4 ; C4E34968E240 | vfmaddps v1-2.x, v1-2.z, matrix[2], v1-2.x . ....r..x.r. ........
[asmjit] vshufps xmm7, xmm7, xmm7, FF ; C5C0C6FFFF | vshufps v1-2.w, v1-2.w, v1-2.w, 255 . ..........x ........
[asmjit] vfmaddps xmm4, xmm7, xmm3, xmm4 ; C4E34168E340 | vfmaddps v1-2.x, v1-2.w, matrix[3], v1-2.x . .....r.x..r ........
[asmjit] vmovaps [rdi+rax], xmm4 ; C5F8292407 | vmovaps [dst+alignHelper], v1-2.x r ......rr... ........
[asmjit] sub rcx, 1 ; 4883E901 | sub count, 1 . .x.... .... ........
[asmjit] L3: ; | . ...... .... ........
[asmjit] ; Split count into full groups of 6 vectors plus a remainder, without a div:
[asmjit] ; 0x2AAAAAAAAAAAAAAB = ceil(2^64 / 6), so the high half of count * magic is count / 6 (exact for count < 2^63).
[asmjit] mov rax, 2AAAAAAAAAAAAAAB ; 48B8ABAAAAAAAAAAAA2A| mov cntLo, 3074457345618258603 . ...... ....w ........
[asmjit] mul rcx ; 48F7E1 | mul cntHi, cntLo, count . .r.... ....xw........
[asmjit] ; cntHi = 6 * (count / 6) vectors; count becomes count % 6 (left for the tail loop).
[asmjit] lea rdx, [rdx+rdx*2] ; 488D1452 | lea cntHi, [cntHi+cntHi*2] . ...... .....x........
[asmjit] shl rdx, 1 ; 48D1E2 | shl cntHi, 1 . ...... .....x........
[asmjit] sub rcx, rdx ; 482BCA | sub count, cntHi . .x.... .....r........
[asmjit] ; Convert to bytes, point src/dst at the end of the grouped region and index it with a negative
[asmjit] ; offset that counts up to zero.
[asmjit] shl rdx, 4 ; 48C1E204 | shl cntHi, 4 . ...... .....x........
[asmjit] add rsi, rdx ; 4803F2 | add src, cntHi . x..... .....r........
[asmjit] add rdi, rdx ; 4803FA | add dst, cntHi x ...... .....r........
[asmjit] neg rdx ; 48F7DA | neg cntHi . ...... .....x........
[asmjit] ; Guard: with count == 6 and a misaligned dst the alignment step leaves 5 vectors, i.e. zero full groups.
[asmjit] ; L4 tests its counter only after the body, so entering it with cntHi == 0 would run past both arrays.
[asmjit] ; ZF still reflects the result of `neg cntHi`; skip the loop when there is nothing for it to do.
[asmjit] jz L7 ; 0F84........ | jz L7 . ...... ..............
[asmjit] .align 32
[asmjit] ; Main loop: 6 vectors (96 bytes) per iteration, as three independent ymm chains of two vectors each.
[asmjit] L4: ; | . ...... ..............
[asmjit] vmovups ymm7, [rsi+rdx] ; C5FC103C16 | vmovups v1-2.w, [src+cntHi] . r..... ...x.r........
[asmjit] vmovups ymm11, [rsi+rdx+0x20] ; C57C105C1620 | vmovups v3-4.w, [src+cntHi+32] . r..... .....r...x....
[asmjit] vmovups ymm15, [rsi+rdx+0x40] ; C57C107C1640 | vmovups v5-6.w, [src+cntHi+64] . r..... .....r.......x
[asmjit] ; Splat .x of every vector and start the accumulators: acc = x * matrix[0].
[asmjit] vshufps ymm4, ymm7, ymm7, 0 ; C5C4C6E700 | vshufps v1-2.x, v1-2.w, v1-2.w, 0 . ...... x..r..........
[asmjit] vshufps ymm8, ymm11, ymm11, 0 ; C44124C6C300 | vshufps v3-4.x, v3-4.w, v3-4.w, 0 . ...... ......x..r....
[asmjit] vshufps ymm12, ymm15, ymm15, 0 ; C44104C6E700 | vshufps v5-6.x, v5-6.w, v5-6.w, 0 . ...... ..........x..r
[asmjit] vmulps ymm4, ymm4, ymm0 ; C5DC59E0 | vmulps v1-2.x, v1-2.x, matrix[0] . ..r... x.............
[asmjit] vmulps ymm8, ymm8, ymm0 ; C53C59C0 | vmulps v3-4.x, v3-4.x, matrix[0] . ..r... ......x.......
[asmjit] vmulps ymm12, ymm12, ymm0 ; C51C59E0 | vmulps v5-6.x, v5-6.x, matrix[0] . ..r... ..........x...
[asmjit] ; acc += y * matrix[1]
[asmjit] vshufps ymm5, ymm7, ymm7, 55 ; C5C4C6EF55 | vshufps v1-2.y, v1-2.w, v1-2.w, 85 . ...... .x.r..........
[asmjit] vshufps ymm9, ymm11, ymm11, 55 ; C44124C6CB55 | vshufps v3-4.y, v3-4.w, v3-4.w, 85 . ...... .......x.r....
[asmjit] vshufps ymm13, ymm15, ymm15, 55 ; C44104C6EF55 | vshufps v5-6.y, v5-6.w, v5-6.w, 85 . ...... ...........x.r
[asmjit] vfmaddps ymm4, ymm5, ymm1, ymm4 ; C4E35568E140 | vfmaddps v1-2.x, v1-2.y, matrix[1], v1-2.x . ...r.. xr............
[asmjit] vfmaddps ymm8, ymm9, ymm1, ymm8 ; C4633568C180 | vfmaddps v3-4.x, v3-4.y, matrix[1], v3-4.x . ...r.. ......xr......
[asmjit] vfmaddps ymm12, ymm13, ymm1, ymm12 ; C4631568E1C0 | vfmaddps v5-6.x, v5-6.y, matrix[1], v5-6.x . ...r.. ..........xr..
[asmjit] ; acc += z * matrix[2]
[asmjit] vshufps ymm6, ymm7, ymm7, AA ; C5C4C6F7AA | vshufps v1-2.z, v1-2.w, v1-2.w, 170 . ...... ..xr..........
[asmjit] vshufps ymm10, ymm11, ymm11, AA ; C44124C6D3AA | vshufps v3-4.z, v3-4.w, v3-4.w, 170 . ...... ........xr....
[asmjit] vshufps ymm14, ymm15, ymm15, AA ; C44104C6F7AA | vshufps v5-6.z, v5-6.w, v5-6.w, 170 . ...... ............xr
[asmjit] vfmaddps ymm4, ymm6, ymm2, ymm4 ; C4E34D68E240 | vfmaddps v1-2.x, v1-2.z, matrix[2], v1-2.x . ....r. x.r...........
[asmjit] vfmaddps ymm8, ymm10, ymm2, ymm8 ; C4632D68C280 | vfmaddps v3-4.x, v3-4.z, matrix[2], v3-4.x . ....r. ......x.r.....
[asmjit] vfmaddps ymm12, ymm14, ymm2, ymm12 ; C4630D68E2C0 | vfmaddps v5-6.x, v5-6.z, matrix[2], v5-6.x . ....r. ..........x.r.
[asmjit] ; acc += w * matrix[3] (the splat of .w overwrites the loaded vector in place).
[asmjit] vshufps ymm7, ymm7, ymm7, FF ; C5C4C6FFFF | vshufps v1-2.w, v1-2.w, v1-2.w, 255 . ...... ...x..........
[asmjit] vshufps ymm11, ymm11, ymm11, FF ; C44124C6DBFF | vshufps v3-4.w, v3-4.w, v3-4.w, 255 . ...... .........x....
[asmjit] vshufps ymm15, ymm15, ymm15, FF ; C44104C6FFFF | vshufps v5-6.w, v5-6.w, v5-6.w, 255 . ...... .............x
[asmjit] vfmaddps ymm4, ymm7, ymm3, ymm4 ; C4E34568E340 | vfmaddps v1-2.x, v1-2.w, matrix[3], v1-2.x . .....r x..r..........
[asmjit] vfmaddps ymm8, ymm11, ymm3, ymm8 ; C4632568C380 | vfmaddps v3-4.x, v3-4.w, matrix[3], v3-4.x . .....r ......x..r....
[asmjit] vfmaddps ymm12, ymm15, ymm3, ymm12 ; C4630568E3C0 | vfmaddps v5-6.x, v5-6.w, matrix[3], v5-6.x . .....r ..........x..r
[asmjit] vmovaps [rdi+rdx], ymm4 ; C5FC292417 | vmovaps [dst+cntHi], v1-2.x r ...... r....r........
[asmjit] vmovaps [rdi+rdx+0x20], ymm8 ; C57C29441720 | vmovaps [dst+cntHi+32], v3-4.x r ...... .....rr.......
[asmjit] vmovaps [rdi+rdx+0x40], ymm12 ; C57C29641740 | vmovaps [dst+cntHi+64], v5-6.x r ...... .....r....r...
[asmjit] ; The offset runs from -(bytes in full groups) up to zero; reaching zero ends the loop.
[asmjit] add rdx, 60 ; 4883C260 | add cntHi, 96 . ...... .....x........
[asmjit] jnz L4 ; 0F854EFFFFFF | jnz L4 . ...... ..............
[asmjit] L7: ; | . ...... ....
[asmjit] ; count now holds the remainder (0..5 vectors); exit if there is none.
[asmjit] test rcx, rcx ; 4885C9 | test count, count . .r.... ....
[asmjit] jz L2 ; 0F84........ | jz L2 . ...... ....
[asmjit] L5: ; | . ...... ....
[asmjit] ; Tail: count (> 0 here) becomes a negative byte offset from the end of the remaining region.
[asmjit] shl rcx, 4 ; 48C1E104 | shl count, 4 . .x.... ....
[asmjit] add rdi, rcx ; 4803F9 | add dst, count x .r.... ....
[asmjit] add rsi, rcx ; 4803F1 | add src, count . xr.... ....
[asmjit] neg rcx ; 48F7D9 | neg count . .x.... ....
[asmjit] .align 32
[asmjit] ; One vector per iteration, same multiply/accumulate sequence as above on xmm registers.
[asmjit] L6: ; | . ...... ....
[asmjit] vmovaps xmm7, [rsi+rcx] ; C5F8283C0E | vmovaps v1-2.w, [src+count] . rr.... ...x
[asmjit] vshufps xmm4, xmm7, xmm7, 0 ; C5C0C6E700 | vshufps v1-2.x, v1-2.w, v1-2.w, 0 . ...... x..r
[asmjit] vmulps xmm4, xmm4, xmm0 ; C5D859E0 | vmulps v1-2.x, v1-2.x, matrix[0] . ..r... x...
[asmjit] vshufps xmm5, xmm7, xmm7, 55 ; C5C0C6EF55 | vshufps v1-2.y, v1-2.w, v1-2.w, 85 . ...... .x.r
[asmjit] vfmaddps xmm4, xmm5, xmm1, xmm4 ; C4E35168E140 | vfmaddps v1-2.x, v1-2.y, matrix[1], v1-2.x . ...r.. xr..
[asmjit] vshufps xmm6, xmm7, xmm7, AA ; C5C0C6F7AA | vshufps v1-2.z, v1-2.w, v1-2.w, 170 . ...... ..xr
[asmjit] vfmaddps xmm4, xmm6, xmm2, xmm4 ; C4E34968E240 | vfmaddps v1-2.x, v1-2.z, matrix[2], v1-2.x . ....r. x.r.
[asmjit] vshufps xmm7, xmm7, xmm7, FF ; C5C0C6FFFF | vshufps v1-2.w, v1-2.w, v1-2.w, 255 . ...... ...x
[asmjit] vfmaddps xmm4, xmm7, xmm3, xmm4 ; C4E34168E340 | vfmaddps v1-2.x, v1-2.w, matrix[3], v1-2.x . .....r x..r
[asmjit] vmovaps [rdi+rcx], xmm4 ; C5F829240F | vmovaps [dst+count], v1-2.x r .r.... r...
[asmjit] add rcx, 10 ; 4883C110 | add count, 16 . .x.... ....
[asmjit] short jnz L6 ; 75C6 | jnz L6 . ...... ....
[asmjit] L2: ; |
[asmjit] ; Clear the upper ymm halves before returning to the caller.
[asmjit] vzeroupper ; C5F877 | vzeroupper
[asmjit] L1: ; |
[asmjit] ret ; C3 |