How is this so fast? <C>

Gat Pelsinger · February 15

From a few days I was working on a program that benchmarks my implementation of string functions with the functions in the C library, but I had so many failed attempts for improving the modularity of the program, so then I said f**k it, am gonna crumble all the code in a single source file who cares? I mean who actually cares right? But now, I am getting my strlen function elapsed times way greater than the inbuilt strlen elapsed. If you remember a post I made earlier, in which I talked about how my strlen implementation is faster, I don't know what happened to it or I did something wrong.

#include <stdio.h>
#include <time.h>
#include <string.h>
#include <immintrin.h>

#define iteration 100

 unsigned long my_strlen1(const char *str)
{
    register unsigned long i = 0;
    for (; str[i]; i++)
        ;
    return i;
}

 unsigned long my_strlen2(const char *str)
{
    register const char *p = str;
    for (; *p; p++)
        ;
    return p - str;
}

 size_t strlen_avx2(const char *str) {
    const char *start = str;
    const __m256i zero = _mm256_setzero_si256();
    const __m256i* ptr = (const __m256i*)str;

    // Prefetch the next cache line
    _mm_prefetch((const char *)(ptr + 2), _MM_HINT_T0);

    while (1) {
        __m256i chunk1 = _mm256_loadu_si256(ptr++);
        __m256i chunk2 = _mm256_loadu_si256(ptr++);

        // Check both chunks for null terminators
        unsigned int mask1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk1, zero));
        unsigned int mask2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk2, zero));

        if (mask1) {
            // Null terminator found in first chunk
            return (size_t)((char*)ptr - 2 - start + __builtin_ctz(mask1));
        }

        if (mask2) {
            // Null terminator found in second chunk
            return (size_t)((char*)ptr - 1 - start + __builtin_ctz(mask2));
        }

        // Prefetch the next cache line
        _mm_prefetch((const char *)(ptr + 2), _MM_HINT_T0);
    }
}

int main(void)
{
    const char* str = "Hello";
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iteration; i++)
    strlen(str);
    clock_gettime(CLOCK_MONOTONIC, &end);
    printf("strlen elapsed: %ld\n", end.tv_nsec - start.tv_nsec);

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iteration; i++)
    my_strlen1(str);
    clock_gettime(CLOCK_MONOTONIC, &end);
    printf("my_strlen1 elapsed: %ld\n", end.tv_nsec - start.tv_nsec);

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iteration; i++)
    my_strlen2(str);
    clock_gettime(CLOCK_MONOTONIC, &end);
    printf("my_strlen2 elapsed: %ld\n", end.tv_nsec - start.tv_nsec);
    
    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iteration; i++)
    strlen_avx2(str);
    clock_gettime(CLOCK_MONOTONIC, &end);
    printf("strlen_avx2 elapsed: %ld\n", end.tv_nsec - start.tv_nsec);

    return 0;
}

Now here, str is only "Hello", but in my program, it is repeated so many times that the size of the string is 100 KB. Nonetheless, you will should get the inbuilt strlen elapsed time way lower than others. Inlining my function won't do anything, and using higher optimization levels still keeps the ratio same. I don't know what I have done. How is strlen suddenly so fast? Or has it always been like this and that last program I made had some flaw?

Btw before you scream at me, here is the assembly. I am only putting it on you because I do not understand it, sorry.

	.file	"main.c"
	.text
	.def	printf;	.scl	3;	.type	32;	.endef
	.seh_proc	printf
printf:
	pushq	%rbp
	.seh_pushreg	%rbp
	pushq	%rbx
	.seh_pushreg	%rbx
	subq	$56, %rsp
	.seh_stackalloc	56
	leaq	48(%rsp), %rbp
	.seh_setframe	%rbp, 48
	.seh_endprologue
	movq	%rcx, 32(%rbp)
	movq	%rdx, 40(%rbp)
	movq	%r8, 48(%rbp)
	movq	%r9, 56(%rbp)
	leaq	40(%rbp), %rax
	movq	%rax, -16(%rbp)
	movq	-16(%rbp), %rbx
	movl	$1, %ecx
	movq	__imp___acrt_iob_func(%rip), %rax
	call	*%rax
	movq	%rax, %rcx
	movq	32(%rbp), %rax
	movq	%rbx, %r8
	movq	%rax, %rdx
	call	__mingw_vfprintf
	movl	%eax, -4(%rbp)
	movl	-4(%rbp), %eax
	addq	$56, %rsp
	popq	%rbx
	popq	%rbp
	ret
	.seh_endproc
	.globl	my_strlen1
	.def	my_strlen1;	.scl	2;	.type	32;	.endef
	.seh_proc	my_strlen1
my_strlen1:
	pushq	%rbp
	.seh_pushreg	%rbp
	pushq	%rbx
	.seh_pushreg	%rbx
	leaq	(%rsp), %rbp
	.seh_setframe	%rbp, 0
	.seh_endprologue
	movq	%rcx, 24(%rbp)
	movl	$0, %ebx
	jmp	.L4
.L5:
	incl	%ebx
.L4:
	movl	%ebx, %edx
	movq	24(%rbp), %rax
	addq	%rdx, %rax
	movzbl	(%rax), %eax
	testb	%al, %al
	jne	.L5
	movl	%ebx, %eax
	popq	%rbx
	popq	%rbp
	ret
	.seh_endproc
	.globl	my_strlen2
	.def	my_strlen2;	.scl	2;	.type	32;	.endef
	.seh_proc	my_strlen2
my_strlen2:
	pushq	%rbp
	.seh_pushreg	%rbp
	pushq	%rbx
	.seh_pushreg	%rbx
	leaq	(%rsp), %rbp
	.seh_setframe	%rbp, 0
	.seh_endprologue
	movq	%rcx, 24(%rbp)
	movq	24(%rbp), %rbx
	jmp	.L8
.L9:
	incq	%rbx
.L8:
	movzbl	(%rbx), %eax
	testb	%al, %al
	jne	.L9
	movq	%rbx, %rax
	subq	24(%rbp), %rax
	popq	%rbx
	popq	%rbp
	ret
	.seh_endproc
	.globl	strlen_avx2
	.def	strlen_avx2;	.scl	2;	.type	32;	.endef
	.seh_proc	strlen_avx2
strlen_avx2:
	pushq	%rbp
	.seh_pushreg	%rbp
	subq	$352, %rsp
	.seh_stackalloc	352
	leaq	128(%rsp), %rbp
	.seh_setframe	%rbp, 128
	.seh_endprologue
	movq	%rcx, 240(%rbp)
	leaq	224(%rbp), %rax
	subq	$352, %rax
	addq	$31, %rax
	shrq	$5, %rax
	salq	$5, %rax
	movq	240(%rbp), %rdx
	movq	%rdx, 208(%rbp)
	vpxor	%xmm0, %xmm0, %xmm0
	vmovdqa	%ymm0, 256(%rax)
	movq	240(%rbp), %rdx
	movq	%rdx, 216(%rbp)
	movq	216(%rbp), %rdx
	addq	$64, %rdx
	prefetcht0	(%rdx)
.L22:
	movq	216(%rbp), %rdx
	leaq	32(%rdx), %rcx
	movq	%rcx, 216(%rbp)
	movq	%rdx, 184(%rbp)
	movq	184(%rbp), %rdx
	vmovdqu	(%rdx), %ymm0
	vmovdqa	%ymm0, 224(%rax)
	movq	216(%rbp), %rdx
	leaq	32(%rdx), %rcx
	movq	%rcx, 216(%rbp)
	movq	%rdx, 192(%rbp)
	movq	192(%rbp), %rdx
	vmovdqu	(%rdx), %ymm0
	vmovdqa	%ymm0, 192(%rax)
	vmovdqa	224(%rax), %ymm0
	vmovdqa	%ymm0, 32(%rax)
	vmovdqa	256(%rax), %ymm0
	vmovdqa	%ymm0, (%rax)
	vmovdqa	32(%rax), %ymm1
	vmovdqa	(%rax), %ymm0
	vpcmpeqb	%ymm0, %ymm1, %ymm0
	vmovdqa	%ymm0, %ymm1
	vmovdqa	%ymm1, %ymm0
	vmovdqa	%ymm0, 64(%rax)
	vmovdqa	64(%rax), %ymm0
	vpmovmskb	%ymm0, %edx
	movl	%edx, 204(%rbp)
	vmovdqa	192(%rax), %ymm0
	vmovdqa	%ymm0, 128(%rax)
	vmovdqa	256(%rax), %ymm0
	vmovdqa	%ymm0, 96(%rax)
	vmovdqa	128(%rax), %ymm1
	vmovdqa	96(%rax), %ymm0
	vpcmpeqb	%ymm0, %ymm1, %ymm0
	vmovdqa	%ymm0, %ymm1
	vmovdqa	%ymm1, %ymm0
	vmovdqa	%ymm0, 160(%rax)
	vmovdqa	160(%rax), %ymm0
	vpmovmskb	%ymm0, %edx
	movl	%edx, 200(%rbp)
	cmpl	$0, 204(%rbp)
	je	.L19
	movq	216(%rbp), %rax
	subq	$2, %rax
	subq	208(%rbp), %rax
	movq	%rax, %rdx
	xorl	%eax, %eax
	tzcntl	204(%rbp), %eax
	cltq
	addq	%rdx, %rax
	jmp	.L20
.L19:
	cmpl	$0, 200(%rbp)
	je	.L21
	movq	216(%rbp), %rax
	decq	%rax
	subq	208(%rbp), %rax
	movq	%rax, %rdx
	xorl	%eax, %eax
	tzcntl	200(%rbp), %eax
	cltq
	addq	%rdx, %rax
	jmp	.L20
.L21:
	movq	216(%rbp), %rdx
	addq	$64, %rdx
	prefetcht0	(%rdx)
	jmp	.L22
.L20:
	addq	$352, %rsp
	popq	%rbp
	ret
	.seh_endproc
	.def	__main;	.scl	2;	.type	32;	.endef
	.section .rdata,"dr"
.LC0:
	.ascii "Hello\0"
.LC1:
	.ascii "strlen elapsed: %ld\12\0"
.LC2:
	.ascii "my_strlen1 elapsed: %ld\12\0"
.LC3:
	.ascii "my_strlen2 elapsed: %ld\12\0"
.LC4:
	.ascii "strlen_avx2 elapsed: %ld\12\0"
	.text
	.globl	main
	.def	main;	.scl	2;	.type	32;	.endef
	.seh_proc	main
main:
	pushq	%rbp
	.seh_pushreg	%rbp
	movq	%rsp, %rbp
	.seh_setframe	%rbp, 0
	subq	$96, %rsp
	.seh_stackalloc	96
	.seh_endprologue
	call	__main
	leaq	.LC0(%rip), %rax
	movq	%rax, -24(%rbp)
	leaq	-48(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	$0, -4(%rbp)
	jmp	.L24
.L25:
	incl	-4(%rbp)
.L24:
	cmpl	$99, -4(%rbp)
	jle	.L25
	leaq	-64(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	-56(%rbp), %eax
	movl	-40(%rbp), %edx
	subl	%edx, %eax
	movl	%eax, %edx
	leaq	.LC1(%rip), %rax
	movq	%rax, %rcx
	call	printf
	leaq	-48(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	$0, -8(%rbp)
	jmp	.L26
.L27:
	movq	-24(%rbp), %rax
	movq	%rax, %rcx
	call	my_strlen1
	incl	-8(%rbp)
.L26:
	cmpl	$99, -8(%rbp)
	jle	.L27
	leaq	-64(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	-56(%rbp), %eax
	movl	-40(%rbp), %edx
	subl	%edx, %eax
	movl	%eax, %edx
	leaq	.LC2(%rip), %rax
	movq	%rax, %rcx
	call	printf
	leaq	-48(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	$0, -12(%rbp)
	jmp	.L28
.L29:
	movq	-24(%rbp), %rax
	movq	%rax, %rcx
	call	my_strlen2
	incl	-12(%rbp)
.L28:
	cmpl	$99, -12(%rbp)
	jle	.L29
	leaq	-64(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	-56(%rbp), %eax
	movl	-40(%rbp), %edx
	subl	%edx, %eax
	movl	%eax, %edx
	leaq	.LC3(%rip), %rax
	movq	%rax, %rcx
	call	printf
	leaq	-48(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	$0, -16(%rbp)
	jmp	.L30
.L31:
	movq	-24(%rbp), %rax
	movq	%rax, %rcx
	call	strlen_avx2
	incl	-16(%rbp)
.L30:
	cmpl	$99, -16(%rbp)
	jle	.L31
	leaq	-64(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime
	movl	-56(%rbp), %eax
	movl	-40(%rbp), %edx
	subl	%edx, %eax
	movl	%eax, %edx
	leaq	.LC4(%rip), %rax
	movq	%rax, %rcx
	call	printf
	movl	$0, %eax
	addq	$96, %rsp
	popq	%rbp
	ret
	.seh_endproc
	.ident	"GCC: (x86_64-win32-seh-rev0, Built by MinGW-Builds project) 13.2.0"
	.def	__mingw_vfprintf;	.scl	2;	.type	32;	.endef
	.def	clock_gettime;	.scl	2;	.type	32;	.endef

This is one with Intel syntax if that's helpful one anyway -

	.file	"main.c"
	.intel_syntax noprefix
	.text
	.def	printf;	.scl	3;	.type	32;	.endef
	.seh_proc	printf
printf:
	push	rbp
	.seh_pushreg	rbp
	push	rbx
	.seh_pushreg	rbx
	sub	rsp, 56
	.seh_stackalloc	56
	lea	rbp, 48[rsp]
	.seh_setframe	rbp, 48
	.seh_endprologue
	mov	QWORD PTR 32[rbp], rcx
	mov	QWORD PTR 40[rbp], rdx
	mov	QWORD PTR 48[rbp], r8
	mov	QWORD PTR 56[rbp], r9
	lea	rax, 40[rbp]
	mov	QWORD PTR -16[rbp], rax
	mov	rbx, QWORD PTR -16[rbp]
	mov	ecx, 1
	mov	rax, QWORD PTR __imp___acrt_iob_func[rip]
	call	rax
	mov	rcx, rax
	mov	rax, QWORD PTR 32[rbp]
	mov	r8, rbx
	mov	rdx, rax
	call	__mingw_vfprintf
	mov	DWORD PTR -4[rbp], eax
	mov	eax, DWORD PTR -4[rbp]
	add	rsp, 56
	pop	rbx
	pop	rbp
	ret
	.seh_endproc
	.globl	my_strlen1
	.def	my_strlen1;	.scl	2;	.type	32;	.endef
	.seh_proc	my_strlen1
my_strlen1:
	push	rbp
	.seh_pushreg	rbp
	push	rbx
	.seh_pushreg	rbx
	lea	rbp, [rsp]
	.seh_setframe	rbp, 0
	.seh_endprologue
	mov	QWORD PTR 24[rbp], rcx
	mov	ebx, 0
	jmp	.L4
.L5:
	inc	ebx
.L4:
	mov	edx, ebx
	mov	rax, QWORD PTR 24[rbp]
	add	rax, rdx
	movzx	eax, BYTE PTR [rax]
	test	al, al
	jne	.L5
	mov	eax, ebx
	pop	rbx
	pop	rbp
	ret
	.seh_endproc
	.globl	my_strlen2
	.def	my_strlen2;	.scl	2;	.type	32;	.endef
	.seh_proc	my_strlen2
my_strlen2:
	push	rbp
	.seh_pushreg	rbp
	push	rbx
	.seh_pushreg	rbx
	lea	rbp, [rsp]
	.seh_setframe	rbp, 0
	.seh_endprologue
	mov	QWORD PTR 24[rbp], rcx
	mov	rbx, QWORD PTR 24[rbp]
	jmp	.L8
.L9:
	inc	rbx
.L8:
	movzx	eax, BYTE PTR [rbx]
	test	al, al
	jne	.L9
	mov	rax, rbx
	sub	rax, QWORD PTR 24[rbp]
	pop	rbx
	pop	rbp
	ret
	.seh_endproc
	.globl	strlen_avx2
	.def	strlen_avx2;	.scl	2;	.type	32;	.endef
	.seh_proc	strlen_avx2
strlen_avx2:
	push	rbp
	.seh_pushreg	rbp
	sub	rsp, 352
	.seh_stackalloc	352
	lea	rbp, 128[rsp]
	.seh_setframe	rbp, 128
	.seh_endprologue
	mov	QWORD PTR 240[rbp], rcx
	lea	rax, 224[rbp]
	sub	rax, 352
	add	rax, 31
	shr	rax, 5
	sal	rax, 5
	mov	rdx, QWORD PTR 240[rbp]
	mov	QWORD PTR 208[rbp], rdx
	vpxor	xmm0, xmm0, xmm0
	vmovdqa	YMMWORD PTR 256[rax], ymm0
	mov	rdx, QWORD PTR 240[rbp]
	mov	QWORD PTR 216[rbp], rdx
	mov	rdx, QWORD PTR 216[rbp]
	add	rdx, 64
	prefetcht0	[rdx]
.L22:
	mov	rdx, QWORD PTR 216[rbp]
	lea	rcx, 32[rdx]
	mov	QWORD PTR 216[rbp], rcx
	mov	QWORD PTR 184[rbp], rdx
	mov	rdx, QWORD PTR 184[rbp]
	vmovdqu	ymm0, YMMWORD PTR [rdx]
	vmovdqa	YMMWORD PTR 224[rax], ymm0
	mov	rdx, QWORD PTR 216[rbp]
	lea	rcx, 32[rdx]
	mov	QWORD PTR 216[rbp], rcx
	mov	QWORD PTR 192[rbp], rdx
	mov	rdx, QWORD PTR 192[rbp]
	vmovdqu	ymm0, YMMWORD PTR [rdx]
	vmovdqa	YMMWORD PTR 192[rax], ymm0
	vmovdqa	ymm0, YMMWORD PTR 224[rax]
	vmovdqa	YMMWORD PTR 32[rax], ymm0
	vmovdqa	ymm0, YMMWORD PTR 256[rax]
	vmovdqa	YMMWORD PTR [rax], ymm0
	vmovdqa	ymm1, YMMWORD PTR 32[rax]
	vmovdqa	ymm0, YMMWORD PTR [rax]
	vpcmpeqb	ymm0, ymm1, ymm0
	vmovdqa	ymm1, ymm0
	vmovdqa	ymm0, ymm1
	vmovdqa	YMMWORD PTR 64[rax], ymm0
	vmovdqa	ymm0, YMMWORD PTR 64[rax]
	vpmovmskb	edx, ymm0
	mov	DWORD PTR 204[rbp], edx
	vmovdqa	ymm0, YMMWORD PTR 192[rax]
	vmovdqa	YMMWORD PTR 128[rax], ymm0
	vmovdqa	ymm0, YMMWORD PTR 256[rax]
	vmovdqa	YMMWORD PTR 96[rax], ymm0
	vmovdqa	ymm1, YMMWORD PTR 128[rax]
	vmovdqa	ymm0, YMMWORD PTR 96[rax]
	vpcmpeqb	ymm0, ymm1, ymm0
	vmovdqa	ymm1, ymm0
	vmovdqa	ymm0, ymm1
	vmovdqa	YMMWORD PTR 160[rax], ymm0
	vmovdqa	ymm0, YMMWORD PTR 160[rax]
	vpmovmskb	edx, ymm0
	mov	DWORD PTR 200[rbp], edx
	cmp	DWORD PTR 204[rbp], 0
	je	.L19
	mov	rax, QWORD PTR 216[rbp]
	sub	rax, 2
	sub	rax, QWORD PTR 208[rbp]
	mov	rdx, rax
	xor	eax, eax
	tzcnt	eax, DWORD PTR 204[rbp]
	cdqe
	add	rax, rdx
	jmp	.L20
.L19:
	cmp	DWORD PTR 200[rbp], 0
	je	.L21
	mov	rax, QWORD PTR 216[rbp]
	dec	rax
	sub	rax, QWORD PTR 208[rbp]
	mov	rdx, rax
	xor	eax, eax
	tzcnt	eax, DWORD PTR 200[rbp]
	cdqe
	add	rax, rdx
	jmp	.L20
.L21:
	mov	rdx, QWORD PTR 216[rbp]
	add	rdx, 64
	prefetcht0	[rdx]
	jmp	.L22
.L20:
	add	rsp, 352
	pop	rbp
	ret
	.seh_endproc
	.def	__main;	.scl	2;	.type	32;	.endef
	.section .rdata,"dr"
.LC0:
	.ascii "Hello\0"
.LC1:
	.ascii "strlen elapsed: %ld\12\0"
.LC2:
	.ascii "my_strlen1 elapsed: %ld\12\0"
.LC3:
	.ascii "my_strlen2 elapsed: %ld\12\0"
.LC4:
	.ascii "strlen_avx2 elapsed: %ld\12\0"
	.text
	.globl	main
	.def	main;	.scl	2;	.type	32;	.endef
	.seh_proc	main
main:
	push	rbp
	.seh_pushreg	rbp
	mov	rbp, rsp
	.seh_setframe	rbp, 0
	sub	rsp, 96
	.seh_stackalloc	96
	.seh_endprologue
	call	__main
	lea	rax, .LC0[rip]
	mov	QWORD PTR -24[rbp], rax
	lea	rax, -48[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	DWORD PTR -4[rbp], 0
	jmp	.L24
.L25:
	inc	DWORD PTR -4[rbp]
.L24:
	cmp	DWORD PTR -4[rbp], 99
	jle	.L25
	lea	rax, -64[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	eax, DWORD PTR -56[rbp]
	mov	edx, DWORD PTR -40[rbp]
	sub	eax, edx
	mov	edx, eax
	lea	rax, .LC1[rip]
	mov	rcx, rax
	call	printf
	lea	rax, -48[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	DWORD PTR -8[rbp], 0
	jmp	.L26
.L27:
	mov	rax, QWORD PTR -24[rbp]
	mov	rcx, rax
	call	my_strlen1
	inc	DWORD PTR -8[rbp]
.L26:
	cmp	DWORD PTR -8[rbp], 99
	jle	.L27
	lea	rax, -64[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	eax, DWORD PTR -56[rbp]
	mov	edx, DWORD PTR -40[rbp]
	sub	eax, edx
	mov	edx, eax
	lea	rax, .LC2[rip]
	mov	rcx, rax
	call	printf
	lea	rax, -48[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	DWORD PTR -12[rbp], 0
	jmp	.L28
.L29:
	mov	rax, QWORD PTR -24[rbp]
	mov	rcx, rax
	call	my_strlen2
	inc	DWORD PTR -12[rbp]
.L28:
	cmp	DWORD PTR -12[rbp], 99
	jle	.L29
	lea	rax, -64[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	eax, DWORD PTR -56[rbp]
	mov	edx, DWORD PTR -40[rbp]
	sub	eax, edx
	mov	edx, eax
	lea	rax, .LC3[rip]
	mov	rcx, rax
	call	printf
	lea	rax, -48[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	DWORD PTR -16[rbp], 0
	jmp	.L30
.L31:
	mov	rax, QWORD PTR -24[rbp]
	mov	rcx, rax
	call	strlen_avx2
	inc	DWORD PTR -16[rbp]
.L30:
	cmp	DWORD PTR -16[rbp], 99
	jle	.L31
	lea	rax, -64[rbp]
	mov	rdx, rax
	mov	ecx, 1
	call	clock_gettime
	mov	eax, DWORD PTR -56[rbp]
	mov	edx, DWORD PTR -40[rbp]
	sub	eax, edx
	mov	edx, eax
	lea	rax, .LC4[rip]
	mov	rcx, rax
	call	printf
	mov	eax, 0
	add	rsp, 96
	pop	rbp
	ret
	.seh_endproc
	.ident	"GCC: (x86_64-win32-seh-rev0, Built by MinGW-Builds project) 13.2.0"
	.def	__mingw_vfprintf;	.scl	2;	.type	32;	.endef
	.def	clock_gettime;	.scl	2;	.type	32;	.endef

dcgreen2k · February 15

The test for the builtin strlen function runs so fast because the function call gets optimized away. The compiler knows you aren't doing anything useful with strlen's return value, so it's free to do whatever it likes in this case.

This code in the C source file:

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iteration; i++)
        strlen(str);
    clock_gettime(CLOCK_MONOTONIC, &end);

corresponds to this code in the assembly output, the first loop after main begins:

	call	clock_gettime
	movl	$0, -4(%rbp)
	jmp	.L24
.L25:
	incl	-4(%rbp)
.L24:
	cmpl	$99, -4(%rbp)
	jle	.L25
	leaq	-64(%rbp), %rax
	movq	%rax, %rdx
	movl	$1, %ecx
	call	clock_gettime

The for loop corresponds to the instruction after .L25 and the two after .L24. As you can see, there's no call to strlen in there. To fix this, you need to find a way to prevent the compiler from optimizing away any function calls. One way to do this, as we've seen in previous posts, is to take a pointer to the function you want to test and pass it to another function that runs the benchmark. Here's an example I adapted from your code above as well as @Eigenvektor's previous benchmarking code.

Spoiler

#include <stdio.h>
#include <time.h>
#include <string.h>

#define iteration 1000000

char* str = "Put long string here";

void benchmark(size_t (*functionPointer)(const char*)) {
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iteration; i++) {
        functionPointer(str);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    printf("elapsed: %ld\n", end.tv_nsec - start.tv_nsec);
}

int main() {
    benchmark(&strlen);
    return 0;
}

Eigenvektor · February 15

Spoiler

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <immintrin.h>

#define CHARLEN 10000
#define LOOPCOUNT 10000

size_t mystrlen(const char* str)
{
    size_t i = 0;

    while(str[i]) ++i;

    return i;
}

size_t kstrlen(const char *s)
{
    const char *sc = s;

    while(*sc) ++sc;

    return sc - s;
}

 size_t strlen_avx2(const char *str) {
    const char *start = str;
    const __m256i zero = _mm256_setzero_si256();
    const __m256i* ptr = (const __m256i*)str;

    while (1) {
        // Prefetch the next cache line
        _mm_prefetch((const char *)(ptr + 2), _MM_HINT_T0);

        __m256i chunk1 = _mm256_loadu_si256(ptr++);
        __m256i chunk2 = _mm256_loadu_si256(ptr++);

        // Check both chunks for null terminators
        unsigned int mask1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk1, zero));
        unsigned int mask2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk2, zero));

        if (mask1) {
            // Null terminator found in first chunk
            return (size_t)((char*)ptr - 2 - start + __builtin_ctz(mask1));
        }

        if (mask2) {
            // Null terminator found in second chunk
            return (size_t)((char*)ptr - 1 - start + __builtin_ctz(mask2));
        }
    }
}

void str_gen(char *dest, size_t length, int offset)
{
    char charset[] = "0123456789"
                     "abcdefghijklmnopqrstuvwxyz"
                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    size_t len = sizeof charset - 1;
    size_t index = offset % len;

    while (length-- > 0) {
        *dest++ = charset[index++];
        index %= len;
    }
    *dest = '\0';
}

void measure(char* name, int run, size_t (*funPtr)(const char*)) {
    unsigned long total = 0;
    unsigned long count;

    struct timespec start;
    struct timespec end;

    char str[] = { [CHARLEN] = '\0' };
    str_gen(str, CHARLEN, 0);

    clock_gettime(CLOCK_MONOTONIC, &start);

    for (int i = 0; i < LOOPCOUNT; i++) {
        total += funPtr(str);
    }

    clock_gettime(CLOCK_MONOTONIC, &end);

    long elapsed = (end.tv_nsec - start.tv_nsec);
    printf("%15s #%d, count %lu, took: %10li ns\n", name, run, total, elapsed);
}

int main(void)
{
    for(int run = 0; run < 3; ++run)
    {
        measure("strlen", run, &strlen);
        measure("kstrlen", run, &kstrlen);
        measure("mystrlen", run, &mystrlen);
        measure("strlen_avx2", run, &strlen_avx2);
    }

    return 0;
}

The AVX method appears to contain a bug. The string length is greater than expected.

         strlen #0, count 100000000, took:     808439 ns
        kstrlen #0, count 100000000, took:   42467828 ns
       mystrlen #0, count 100000000, took:     632631 ns
    strlen_avx2 #0, count 100620000, took:    1840325 ns
         strlen #1, count 100000000, took:     648831 ns
        kstrlen #1, count 100000000, took:   29775410 ns
       mystrlen #1, count 100000000, took:     383364 ns
    strlen_avx2 #1, count 100620000, took:    1112645 ns
         strlen #2, count 100000000, took:     380785 ns
        kstrlen #2, count 100000000, took:   21637018 ns
       mystrlen #2, count 100000000, took:     378825 ns
    strlen_avx2 #2, count 100620000, took:    1116355 ns

Gat Pelsinger · February 15

3 hours ago, Eigenvektor said:

Reveal hidden contents

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <immintrin.h>

#define CHARLEN 10000
#define LOOPCOUNT 10000

size_t mystrlen(const char* str)
{
    size_t i = 0;

    while(str[i]) ++i;

    return i;
}

size_t kstrlen(const char *s)
{
    const char *sc = s;

    while(*sc) ++sc;

    return sc - s;
}

 size_t strlen_avx2(const char *str) {
    const char *start = str;
    const __m256i zero = _mm256_setzero_si256();
    const __m256i* ptr = (const __m256i*)str;

    while (1) {
        // Prefetch the next cache line
        _mm_prefetch((const char *)(ptr + 2), _MM_HINT_T0);

        __m256i chunk1 = _mm256_loadu_si256(ptr++);
        __m256i chunk2 = _mm256_loadu_si256(ptr++);

        // Check both chunks for null terminators
        unsigned int mask1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk1, zero));
        unsigned int mask2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk2, zero));

        if (mask1) {
            // Null terminator found in first chunk
            return (size_t)((char*)ptr - 2 - start + __builtin_ctz(mask1));
        }

        if (mask2) {
            // Null terminator found in second chunk
            return (size_t)((char*)ptr - 1 - start + __builtin_ctz(mask2));
        }
    }
}

void str_gen(char *dest, size_t length, int offset)
{
    char charset[] = "0123456789"
                     "abcdefghijklmnopqrstuvwxyz"
                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    size_t len = sizeof charset - 1;
    size_t index = offset % len;

    while (length-- > 0) {
        *dest++ = charset[index++];
        index %= len;
    }
    *dest = '\0';
}

void measure(char* name, int run, size_t (*funPtr)(const char*)) {
    unsigned long total = 0;
    unsigned long count;

    struct timespec start;
    struct timespec end;

    char str[] = { [CHARLEN] = '\0' };
    str_gen(str, CHARLEN, 0);

    clock_gettime(CLOCK_MONOTONIC, &start);

    for (int i = 0; i < LOOPCOUNT; i++) {
        total += funPtr(str);
    }

    clock_gettime(CLOCK_MONOTONIC, &end);

    long elapsed = (end.tv_nsec - start.tv_nsec);
    printf("%15s #%d, count %lu, took: %10li ns\n", name, run, total, elapsed);
}

int main(void)
{
    for(int run = 0; run < 3; ++run)
    {
        measure("strlen", run, &strlen);
        measure("kstrlen", run, &kstrlen);
        measure("mystrlen", run, &mystrlen);
        measure("strlen_avx2", run, &strlen_avx2);
    }

    return 0;
}

The AVX method appears to contain a bug. The string length is greater than expected.

         strlen #0, count 100000000, took:     808439 ns
        kstrlen #0, count 100000000, took:   42467828 ns
       mystrlen #0, count 100000000, took:     632631 ns
    strlen_avx2 #0, count 100620000, took:    1840325 ns
         strlen #1, count 100000000, took:     648831 ns
        kstrlen #1, count 100000000, took:   29775410 ns
       mystrlen #1, count 100000000, took:     383364 ns
    strlen_avx2 #1, count 100620000, took:    1112645 ns
         strlen #2, count 100000000, took:     380785 ns
        kstrlen #2, count 100000000, took:   21637018 ns
       mystrlen #2, count 100000000, took:     378825 ns
    strlen_avx2 #2, count 100620000, took:    1116355 ns

It's AI generated, what do you expect ?

Gat Pelsinger · February 15

@dcgreen2k @Eigenvektor

So it looks like the inbuilt strlen is actually faster. I was here battling the C standard itself to prove my functions are faster, but I guess not. Is there a way I can view the actual strlen function that is being called in my program? I can view libc online, but one of my strlen functions is literally a copy of that function, still it is slow, so it is not the same function that is being called. I want to see what secret sauce strlen is using. Is it possible to extract that from the assembly file? I couldn't quite see any strlen related code, just a call to it.

dcgreen2k · February 16

5 hours ago, Gat Pelsinger said:

So it looks like the inbuilt strlen is actually faster. I was here battling the C standard itself to prove my functions are faster, but I guess not. Is there a way I can view the actual strlen function that is being called in my program? I can view libc online, but one of my strlen functions is literally a copy of that function, still it is slow, so it is not the same function that is being called. I want to see what secret sauce strlen is using. Is it possible to extract that from the assembly file? I couldn't quite see any strlen related code, just a call to it.

Yes. I was able to figure out what code the builtin strlen was using with GDB and some searching on the internet. GDB is a debugger that lets you view the program's assembly code while you step through it. When I stepped into the call to strlen, GDB reported this:

This means the code actually called a function named __strlen_avx2. Searching online, I found the source code here: https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/multiarch/strlen-avx2.S

It's an implementation of strlen optimized with AVX2 instructions, and it's all handwritten assembly code. When I tested it out, this function only executed 200 instructions to find the length of a 2048-character string. The "regular" versions of strlen would have taken many thousands of executed instructions to do the same.

Also, the code for strlen isn't placed into your program because it's dynamically linked. This means the operating system stores its own copy of the function and your program simply jumps to it whenever it's needed. This also means the operating system can replace a call to "plain" strlen with an optimized version of it - in this case one using AVX2.

Gat Pelsinger · February 16

@dcgreen2k

Wow, looks like I should actually trust the C libraries . I was questioning them because the implementation I saw that I assumed was being used didn't seem to be that optimized, but not when the real function which is being used is literally coded right in AVX2 assembly. No matter how optimized C code you right, unless you get your hands dirty using assembly, you ain't achieving anything better.

Sign In

How is this so fast? <C>

Link to comment

Share on other sites

Link to post

Share on other sites

Link to comment

Share on other sites

Link to post

Share on other sites

Link to comment

Share on other sites

Link to post

Share on other sites

Link to comment

Share on other sites

Link to post

Share on other sites

Link to comment

Share on other sites

Link to post

Share on other sites

Link to comment

Share on other sites

Link to post

Share on other sites

Link to comment

Share on other sites

Link to post

Share on other sites

Create an account or sign in to comment

Create an account

Sign in

Featured Topics

Topics

Latest From Linus Tech Tips:

I Was Never Meant to Have This Prototype CPU

Latest From Tech Quickie:

Why Do Speakers Hiss?

Latest From TechLinked:

Intel: “It Wasn’t Me”

Latest From GameLinked:

Is Nintendo being FRAMED?

Latest From ShortCircuit:

How is this even handheld?! - OneXPlayer X1

Latest From Mac Address:

Why did you buy an Apple Vision Pro?

Latest From Channel Super Fun:

I Swapped the CEO's Assistant For a Day!