.text

#-----------------------------------------------------------------------
# RC4 stream cipher, x86-64, AT&T syntax, SysV AMD64 ABI.
# C signature (OpenSSL-style — TODO confirm against the caller's header):
#	void RC4(RC4_KEY *key, size_t len, const void *inp, void *out)
# In:	rdi = key schedule, rsi = len (bytes), rdx = input, rcx = output
# Out:	*out = *inp XOR keystream; x/y indices written back into the key.
# Key layout: two 32-bit words (x, y) followed by S[]; after "add $8,%rdi"
#	rdi points at S[0] and x/y live at -8(%rdi)/-4(%rdi).  A -1 sentinel
#	at 256(%rdi) selects the byte-sized S[] path (.LRC4_CHAR_*);
#	otherwise S[] is an array of 32-bit ints (scale-4 addressing).
# Clobbers: rax, rdx, rcx, rsi, r8-r11, flags; r12-r14 are saved/restored.
#-----------------------------------------------------------------------
.globl	RC4
.type	RC4,@function
.align	16
RC4:	or	%rsi,%rsi			# len == 0?
	jne	.Lentry
	.byte	0xF3,0xC3			# rep ret (two-byte return; AMD branch-predictor idiom)
.Lentry:
	push	%r12				# callee-saved temporaries for the char path
	push	%r13
	push	%r14
	add	$8,%rdi				# rdi = &S[0]; x/y now at -8/-4(%rdi)
	movl	-8(%rdi),%r10d			# r10d = x
	movl	-4(%rdi),%r11d			# r11d = y
	cmpl	$-1,256(%rdi)			# sentinel marks the 8-bit S[] layout
	je	.LRC4_CHAR_BEGIN
	test	$-8,%rsi			# len & ~7: any whole 8-byte chunks?
	jz	.Lloop1
.align	16
# 8-way unrolled loop, int S[] layout.  Each step is the classic PRGA:
#	x++; tx = S[x]; y += tx; ty = S[y]; S[y] = tx; S[x] = ty;
#	keystream byte = S[(tx + ty) & 0xff]
# The 8 keystream bytes are gathered into rax one at a time, each
# "movb ...,%al" followed by "ror $8,%rax" to rotate it into place;
# index arithmetic is done on the byte registers (r10b/r11b/r9b) so the
# mod-256 wraparound is free.
.Lloop8:
	inc	%r10b				# x++
	movl	(%rdi,%r10,4),%r8d		# tx = S[x]
	add	%r8b,%r11b			# y += tx (mod 256)
	movl	(%rdi,%r11,4),%r9d		# ty = S[y]
	movl	%r8d,(%rdi,%r11,4)		# S[y] = tx
	movl	%r9d,(%rdi,%r10,4)		# S[x] = ty
	add	%r8b,%r9b			# r9b = (tx + ty) & 0xff
# steps 2..8: identical swap, interleaved with picking up the PREVIOUS
# step's keystream byte S[r9] into rax.
	inc	%r10b
	movl	(%rdi,%r10,4),%r8d
	movb	(%rdi,%r9,4),%al		# fetch previous step's keystream byte
	add	%r8b,%r11b
	ror	$8,%rax				# rotate it into position
	movl	(%rdi,%r11,4),%r9d
	movl	%r8d,(%rdi,%r11,4)
	movl	%r9d,(%rdi,%r10,4)
	add	%r8b,%r9b
	inc	%r10b
	movl	(%rdi,%r10,4),%r8d
	movb	(%rdi,%r9,4),%al
	add	%r8b,%r11b
	ror	$8,%rax
	movl	(%rdi,%r11,4),%r9d
	movl	%r8d,(%rdi,%r11,4)
	movl	%r9d,(%rdi,%r10,4)
	add	%r8b,%r9b
	inc	%r10b
	movl	(%rdi,%r10,4),%r8d
	movb	(%rdi,%r9,4),%al
	add	%r8b,%r11b
	ror	$8,%rax
	movl	(%rdi,%r11,4),%r9d
	movl	%r8d,(%rdi,%r11,4)
	movl	%r9d,(%rdi,%r10,4)
	add	%r8b,%r9b
	inc	%r10b
	movl	(%rdi,%r10,4),%r8d
	movb	(%rdi,%r9,4),%al
	add	%r8b,%r11b
	ror	$8,%rax
	movl	(%rdi,%r11,4),%r9d
	movl	%r8d,(%rdi,%r11,4)
	movl	%r9d,(%rdi,%r10,4)
	add	%r8b,%r9b
	inc	%r10b
	movl	(%rdi,%r10,4),%r8d
	movb	(%rdi,%r9,4),%al
	add	%r8b,%r11b
	ror	$8,%rax
	movl	(%rdi,%r11,4),%r9d
	movl	%r8d,(%rdi,%r11,4)
	movl	%r9d,(%rdi,%r10,4)
	add	%r8b,%r9b
	inc	%r10b
	movl	(%rdi,%r10,4),%r8d
	movb	(%rdi,%r9,4),%al
	add	%r8b,%r11b
	ror	$8,%rax
	movl	(%rdi,%r11,4),%r9d
	movl	%r8d,(%rdi,%r11,4)
	movl	%r9d,(%rdi,%r10,4)
	add	%r8b,%r9b
	inc	%r10b
	movl	(%rdi,%r10,4),%r8d
	movb	(%rdi,%r9,4),%al
	add	%r8b,%r11b
	ror	$8,%rax
	movl	(%rdi,%r11,4),%r9d
	movl	%r8d,(%rdi,%r11,4)
	movl	%r9d,(%rdi,%r10,4)
	sub	$8,%rsi				# len -= 8
	add	%r9b,%r8b			# index of the 8th keystream byte
	movb	(%rdi,%r8,4),%al		# fetch it
	ror	$8,%rax				# rax = 8 keystream bytes, in input order
	add	$8,%rdx
	add	$8,%rcx

	xor	-8(%rdx),%rax			# out[0..7] = in[0..7] ^ keystream
	mov	%rax,-8(%rcx)

	test	$-8,%rsi			# another full 8-byte chunk left?
	jnz	.Lloop8
	cmp	$0,%rsi				# 1..7 tail bytes?
	jne	.Lloop1
.Lexit:
	movl	%r10d,-8(%rdi)			# write x back into the key structure
	movl	%r11d,-4(%rdi)			# write y back
	pop	%r14
	pop	%r13
	pop	%r12
	.byte	0xF3,0xC3			# rep ret
.align	16
# Byte-at-a-time tail for the int S[] layout.
.Lloop1:
	movzb	(%rdx),%eax			# eax = input byte
	inc	%r10b				# x++
	movl	(%rdi,%r10,4),%r8d		# tx = S[x]
	add	%r8b,%r11b			# y += tx
	movl	(%rdi,%r11,4),%r9d		# ty = S[y]
	movl	%r8d,(%rdi,%r11,4)		# S[y] = tx
	movl	%r9d,(%rdi,%r10,4)		# S[x] = ty
	add	%r9b,%r8b			# r8b = (tx + ty) & 0xff
	movl	(%rdi,%r8,4),%r9d		# keystream byte
	xor	%r9,%rax			# cipher it
	inc	%rdx
	movb	%al,(%rcx)			# store output byte
	inc	%rcx
	dec	%rsi
	jnz	.Lloop1
	jmp	.Lexit
	
.align	16
# 8-bit (char) S[] layout.  x (r10b) is kept PRE-incremented throughout
# this path and is decremented once before jumping to .Lexit.  In the
# unrolled loop the value destined for S[y] is parked in r13b/r14b; when
# y happens to equal the freshly incremented x (cmp %r10,%r11 / je nextN)
# the reload of S[x] is skipped — r8 still holds the very value the
# pending S[y] store will deposit there (hand-rolled store-to-load
# forwarding).  r12 carries 8 input bytes, xored out in two dword halves.
.LRC4_CHAR_BEGIN:
	add	$1, %r10b			# pre-increment x
	cmp	$8, %rsi			# NOTE(review): signed compare on a size_t len;
	jle 	.LRC4_CHAR_LOOP1		#   len >= 2^63 takes the byte loop — slow but still correct. TODO confirm intent.
.LRC4_CHAR_LOOP8:
	mov	(%rdx),%r12			# r12 = next 8 input bytes
	sub	$8,%rsi
1:
	movzb	(%rdi,%r10),%r8d		# tx = S[x]
	add	%r8b,%r11b			# y += tx
	movzb	(%rdi,%r11),%r9d		# ty = S[y]
	movb	%r9b,(%rdi,%r10)		# S[x] = ty
	movb	%r8b,(%rdi,%r11)		# S[y] = tx
	add	$1, %r10b			# x++
	add	%r8b,%r9b			# keystream index
	movzb	(%rdi,%r9),%eax			# keystream byte 1
	ror	$8, %eax
2:
	movzb	(%rdi,%r10),%r8d		# tx = S[x]
	movb	%r8b, %r13b			# park tx; S[y] store is deferred to step 3
	add	%r8b,%r11b			# y += tx
	movzb	(%rdi,%r11),%r9d		# ty = S[y]
	movb	%r9b, (%rdi,%r10)		# S[x] = ty
	add	$1, %r10b			# x++
	add	%r8b,%r9b
	cmp	%r10, %r11			# does y alias the new x?
	je	next3				# yes: r8 already holds what S[x] is about to become
3:
	movzb	(%rdi,%r10),%r8d		# tx = S[x]
next3:
	movb	%r8b, %r14b			# park tx for step 4
	movb	%r13b, (%rdi,%r11)		# complete step 2's S[y] = tx
	movb	(%rdi,%r9),%al			# keystream byte 2
	ror	$8, %eax
	add	%r8b,%r11b			# y += tx
	movzb	(%rdi,%r11),%r9d		# ty = S[y]
	movb	%r9b,(%rdi,%r10)		# S[x] = ty
	add	$1, %r10b			# x++
	add	%r8b,%r9b
	cmp	%r10, %r11			# aliasing check, as above
	je	next4
4:
	movzb	(%rdi,%r10),%r8d
next4:	
	movb	%r8b, %r13b
	movb	%r14b,(%rdi,%r11)		# complete step 3's S[y] store
	movb	(%rdi,%r9),%al			# keystream byte 3
	ror	$8, %eax
	add	%r8b,%r11b
	movzb	(%rdi,%r11),%r9d
	movb	%r9b,(%rdi,%r10)
	add	$1, %r10b
	add	%r8b,%r9b
	cmp	%r10, %r11
	je	next5
5:
	movzb	(%rdi,%r10),%r8d
next5:
	movb	%r13b,(%rdi,%r11)		# complete step 4's S[y] store
	movb	(%rdi,%r9),%al			# keystream byte 4
	ror	$8, %eax			# eax = keystream bytes 1..4
	xor	%r12d, %eax			# cipher first input dword
	ror	$32, %r12			# expose the high input dword
	mov	%eax,(%rcx)			# out[0..3]
	add	%r8b,%r11b			# step 5 swap (no deferral: stores done in-line)
	movzb	(%rdi,%r11),%r9d
	movb	%r9b,(%rdi,%r10)
	movb	%r8b,(%rdi,%r11)
	add	$1, %r10b
	add	%r8b,%r9b
	movzb	(%rdi,%r9),%eax			# keystream byte 5
	ror	$8, %eax
6:
	movzb	(%rdi,%r10),%r8d		# steps 6..8 mirror 2..4
	movb	%r8b, %r13b
	add	%r8b,%r11b
	movzb	(%rdi,%r11),%r9d
	movb	%r9b,(%rdi,%r10)
	add	$1, %r10b
	add	%r8b,%r9b
	cmp	%r10, %r11
	je	next7
7:
	movzb	(%rdi,%r10),%r8d
next7:	
	movb	%r8b, %r14b
	movb	%r13b,(%rdi,%r11)
	movb	(%rdi,%r9),%al			# keystream byte 6
	add	%r8b,%r11b
	movzb	(%rdi,%r11),%r9d
	ror	$8, %eax
	movb	%r9b, (%rdi,%r10)
	add	$1, %r10b
	add	%r8b,%r9b
	cmp	%r10, %r11
	je	next8
8:
	movzb	(%rdi,%r10),%r8d
next8:
	movb	%r14b,(%rdi,%r11)
	movb	(%rdi,%r9),%al			# keystream byte 7
	add	%r8b,%r11b
	movzb	(%rdi,%r11),%r9d
	movb	%r9b, (%rdi,%r10)
	ror	$8, %eax
	movb	%r8b, (%rdi,%r11)
	add	$1, %r10b
	add	%r8b,%r9b
	movb	(%rdi,%r9),%al			# keystream byte 8
	ror	$8, %eax			# eax = keystream bytes 5..8
	xor	%r12d, %eax			# cipher second input dword
	
	mov	%eax,4(%rcx)			# out[4..7]
	add	$8, %rcx
	add	$8, %rdx
	test    $-8,%rsi			# another full 8-byte chunk?
	jne	.LRC4_CHAR_LOOP8
	cmp	$0,%rsi				# 1..7 tail bytes?
	jnz	.LRC4_CHAR_LOOP1
	sub	$1, %r10b			# undo the pre-increment of x
	jmp	.Lexit

# Byte-at-a-time loop for the char S[] layout (x still pre-incremented).
.LRC4_CHAR_LOOP1:
	movzb	(%rdi,%r10),%r8d		# tx = S[x]
	add	%r8b,%r11b			# y += tx
	movzb	(%rdi,%r11),%r9d		# ty = S[y]
	movb	%r8b,(%rdi,%r11)		# S[y] = tx
	movb	%r9b,(%rdi,%r10)		# S[x] = ty
	add	%r8b,%r9b			# keystream index
	movzb	(%rdi,%r9),%r9d			# keystream byte
	add	$1, %r10b			# x++
	xorb	(%rdx),%r9b			# cipher input byte
	movb	%r9b,(%rcx)			# store output byte
	add	$1, %rdx
	add	$1, %rcx
	sub	$1,%rsi
	jnz	.LRC4_CHAR_LOOP1
	sub	$1, %r10b			# undo the pre-increment of x
	jmp	.Lexit
.size	RC4,.-RC4
