// Code generated by command: go run decode_asm.go -pkg base64 -out ../base64/decode_amd64.s -stubs ../base64/decode_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"
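
// The stubs file referenced in the generated header (decode_amd64.go) is not
// included here; going by that header and the signature comments below, it
// declares the Go prototypes for these routines. A minimal sketch (assumed,
// not copied from the actual stub file):
//
//	func decodeAVX2(dst []byte, src []byte, lut *int8) (int, int)
//	func decodeAVX2URI(dst []byte, src []byte, lut *int8) (int, int)

// b64_dec_lut_hi: per-high-nibble character-class bits. Each value is
// AND-tested (VPTEST) against the per-low-nibble class bits from the caller's
// lut to detect bytes outside the base64 alphabet.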
DATA b64_dec_lut_hi<>+0(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+8(SB)/8, $0x1010101010101010
DATA b64_dec_lut_hi<>+16(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+24(SB)/8, $0x1010101010101010
GLOBL b64_dec_lut_hi<>(SB), RODATA|NOPTR, $32
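
// b64_dec_madd1: VPMADDUBSW multiplier (0x01400140 per dword); merges each
// pair of 6-bit values a,b into a*0x40 + b, one 12-bit field per word.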
DATA b64_dec_madd1<>+0(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+8(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+16(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+24(SB)/8, $0x0140014001400140
GLOBL b64_dec_madd1<>(SB), RODATA|NOPTR, $32
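
// b64_dec_madd2: VPMADDWD multiplier (0x00011000 per dword); combines the two
// 12-bit fields of each dword into a single 24-bit group.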
DATA b64_dec_madd2<>+0(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+8(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+16(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+24(SB)/8, $0x0001100000011000
GLOBL b64_dec_madd2<>(SB), RODATA|NOPTR, $32
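
// b64_dec_shuf_lo: VPSHUFB control for the extracted upper 128-bit lane; it
// positions the decoded bytes that must land in output dword 3, which the
// in-lane shuffle below cannot reach.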
DATA b64_dec_shuf_lo<>+0(SB)/8, $0x0000000000000000
DATA b64_dec_shuf_lo<>+8(SB)/8, $0x0600010200000000
GLOBL b64_dec_shuf_lo<>(SB), RODATA|NOPTR, $16
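
// b64_dec_shuf: in-lane VPSHUFB control that gathers the 3 significant bytes
// of each dword, collapsing 32 decoded bytes into 24 contiguous output bytes.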
DATA b64_dec_shuf<>+0(SB)/8, $0x090a040506000102
DATA b64_dec_shuf<>+8(SB)/8, $0x000000000c0d0e08
DATA b64_dec_shuf<>+16(SB)/8, $0x0c0d0e08090a0405
DATA b64_dec_shuf<>+24(SB)/8, $0x0000000000000000
GLOBL b64_dec_shuf<>(SB), RODATA|NOPTR, $32
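
// decodeAVX2 decodes 32 base64 characters per iteration into 24 output bytes,
// in the style of the Muła/Lemire vectorized decoder: classify each byte by
// its low and high nibble to reject anything outside the alphabet, add a
// per-character delta to turn ASCII into 6-bit values, then multiply-add and
// shuffle those values into packed 3-byte groups. The 32-byte lut argument
// supplies the per-high-nibble deltas (first 16 bytes) and the per-low-nibble
// class bits (last 16 bytes). The two results are the number of bytes written
// to dst and the number of bytes consumed from src; the routine stops at the
// first invalid character or once fewer than 45 source bytes remain, leaving
// the tail for the caller. A hypothetical call site (a sketch only, not the
// package's actual wrapper) could look like:
//
//	n, consumed := decodeAVX2(dst, src, &lut[0])
//	// finish src[consumed:] with a scalar decoder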
// func decodeAVX2(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
TEXT ·decodeAVX2(SB), NOSPLIT, $0-72
	MOVQ dst_base+0(FP), AX
	MOVQ src_base+24(FP), DX
	MOVQ lut+48(FP), SI
	MOVQ src_len+32(FP), DI
	MOVB $0x2f, CL
	PINSRB $0x00, CX, X8
	VPBROADCASTB X8, Y8                // Y8 = 32 x 0x2f ('/'): shuffle-index mask and '/' comparand
	XORQ CX, CX                        // CX = bytes written to dst
	XORQ BX, BX                        // BX = bytes consumed from src
	VPXOR Y7, Y7, Y7                   // Y7 = zero, clears the unused tail of each 32-byte store
	VPERMQ $0x44, (SI), Y6             // Y6 = lut[0:16] in both lanes: per-high-nibble ASCII deltas
	VPERMQ $0x44, 16(SI), Y4           // Y4 = lut[16:32] in both lanes: per-low-nibble class bits
	VMOVDQA b64_dec_lut_hi<>+0(SB), Y5 // Y5 = per-high-nibble class bits

loop:
	VMOVDQU (DX)(BX*1), Y0 // load 32 input characters

	// Validate: a byte lies outside the base64 alphabet iff its low-nibble
	// and high-nibble class bits intersect.
	VPSRLD  $0x04, Y0, Y2
	VPAND   Y8, Y0, Y3
	VPSHUFB Y3, Y4, Y3
	VPAND   Y8, Y2, Y2
	VPSHUFB Y2, Y5, Y9
	VPTEST  Y9, Y3
	JNE     done // invalid character: stop and report progress so far

	// Translate ASCII to 6-bit values: add a per-character delta selected by
	// high nibble ('/' is bumped to its own row, it shares a nibble with '+').
	VPCMPEQB Y8, Y0, Y3
	VPADDB   Y3, Y2, Y2
	VPSHUFB  Y2, Y6, Y2
	VPADDB   Y0, Y2, Y0

	// Pack: merge the four 6-bit values of each dword into a 24-bit group,
	// then shuffle/blend the 3 significant bytes of every dword into 24
	// contiguous output bytes (the top 8 bytes of the store are zero).
	VPMADDUBSW   b64_dec_madd1<>+0(SB), Y0, Y0
	VPMADDWD     b64_dec_madd2<>+0(SB), Y0, Y0
	VEXTRACTI128 $0x01, Y0, X1
	VPSHUFB      b64_dec_shuf_lo<>+0(SB), X1, X1
	VPSHUFB      b64_dec_shuf<>+0(SB), Y0, Y0
	VPBLENDD     $0x08, Y1, Y0, Y1
	VPBLENDD     $0xc0, Y7, Y1, Y1
	VMOVDQU      Y1, (AX)(CX*1)

	ADDQ $0x18, CX // 24 bytes produced
	ADDQ $0x20, BX // 32 bytes consumed
	SUBQ $0x20, DI
	CMPQ DI, $0x2d // keep at least 0x2d (45) bytes in reserve so the 32-byte load/store never overruns
	JB   done
	JMP  loop

done:
	MOVQ CX, ret+56(FP)  // bytes written to dst
	MOVQ BX, ret1+64(FP) // bytes consumed from src
	VZEROUPPER
	RET
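
// decodeAVX2URI handles the base64url alphabet by rewriting '_' (0x5f) to '/'
// (0x2f) in each 32-byte block before running the same decode loop as
// decodeAVX2; presumably the caller supplies a lut adapted to the rest of the
// URI alphabet (e.g. '-').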
// func decodeAVX2URI(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
TEXT ·decodeAVX2URI(SB), NOSPLIT, $0-72
	MOVB $0x2f, AL
	PINSRB $0x00, AX, X0
	VPBROADCASTB X0, Y0 // Y0 = 32 x '/' (replacement byte)
	MOVB $0x5f, AL
	PINSRB $0x00, AX, X1
	VPBROADCASTB X1, Y1 // Y1 = 32 x '_' (base64url character to translate)
	MOVQ dst_base+0(FP), AX
	MOVQ src_base+24(FP), DX
	MOVQ lut+48(FP), SI
	MOVQ src_len+32(FP), DI
	MOVB $0x2f, CL
	PINSRB $0x00, CX, X10
	VPBROADCASTB X10, Y10
	XORQ CX, CX
	XORQ BX, BX
	VPXOR Y9, Y9, Y9
	VPERMQ $0x44, (SI), Y8
	VPERMQ $0x44, 16(SI), Y6
	VMOVDQA b64_dec_lut_hi<>+0(SB), Y7

loop:
	VMOVDQU (DX)(BX*1), Y2

	// Rewrite '_' as '/' so the standard-alphabet path applies; the steps
	// below match decodeAVX2 (with different register numbers).
	VPCMPEQB  Y2, Y1, Y4
	VPBLENDVB Y4, Y0, Y2, Y2

	VPSRLD  $0x04, Y2, Y4
	VPAND   Y10, Y2, Y5
	VPSHUFB Y5, Y6, Y5
	VPAND   Y10, Y4, Y4
	VPSHUFB Y4, Y7, Y11
	VPTEST  Y11, Y5
	JNE     done
	VPCMPEQB Y10, Y2, Y5
	VPADDB   Y5, Y4, Y4
	VPSHUFB  Y4, Y8, Y4
	VPADDB   Y2, Y4, Y2
	VPMADDUBSW   b64_dec_madd1<>+0(SB), Y2, Y2
	VPMADDWD     b64_dec_madd2<>+0(SB), Y2, Y2
	VEXTRACTI128 $0x01, Y2, X3
	VPSHUFB      b64_dec_shuf_lo<>+0(SB), X3, X3
	VPSHUFB      b64_dec_shuf<>+0(SB), Y2, Y2
	VPBLENDD     $0x08, Y3, Y2, Y3
	VPBLENDD     $0xc0, Y9, Y3, Y3
	VMOVDQU      Y3, (AX)(CX*1)
	ADDQ $0x18, CX
	ADDQ $0x20, BX
	SUBQ $0x20, DI
	CMPQ DI, $0x2d
	JB   done
	JMP  loop

done:
	MOVQ CX, ret+56(FP)  // bytes written to dst
	MOVQ BX, ret1+64(FP) // bytes consumed from src
	VZEROUPPER
	RET