File:  [DVB] / dietlibc / x86_64 / memccpy.c
Revision 1.1: download - view: text, annotated - select for diffs
Mon Jun 6 11:48:30 2016 UTC (7 years, 11 months ago) by leitner
Branches: MAIN
CVS tags: HEAD
  add x86_64 memccpy, mempcpy
  import Debian bugfixes (Christian Seiler)

#define _POSIX_SOURCE
#define _XOPEN_SOURCE
#include <sys/types.h>
#include <string.h>
#include <emmintrin.h>

/*
 * memccpy - copy bytes from src to dst, stopping after the first byte
 * equal to (unsigned char)c has been copied, or after count bytes have
 * been copied, whichever comes first.
 *
 *   dst    destination buffer
 *   src    source buffer
 *   c      byte to stop at; only its value converted to unsigned char
 *          is used, in both the vector and the scalar path
 *   count  maximum number of bytes to copy
 *
 * Returns a pointer to the byte in dst right after the copy of c, or
 * NULL if c does not occur in the first count bytes of src.
 *
 * For count >= 16 the data is moved in 16-byte SSE2 vectors with
 * unaligned loads and stores.  If count is not a multiple of 16, the
 * first step is only count%16 bytes long, so the first two vectors
 * overlap and the last vector ends exactly at src+count.
 *
 * NOTE: the vector path loads a whole 16-byte chunk of src before it
 * looks for c, so it may read up to 15 bytes of src located after the
 * matching byte (but never beyond src+count).
 */
void *memccpy(void *dst, const void *src, int c, size_t count)
{
  char *a = dst;
  const char *b = src;
  const unsigned char needle = (unsigned char)c;
  size_t i;

  if (count >= 16) {
    const __m128i cmpval = _mm_set1_epi8((char)needle);
    size_t step = count % 16;
    const size_t limit = count - step;  /* start offsets stay below this */

    /* A zero first step would process the first vector twice. */
    if (step == 0)
      step = 16;

    for (i = 0; i < limit; ) {
      __m128i tmp = _mm_loadu_si128((const __m128i *)(b + i));
      int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(cmpval, tmp));

      if (cmp) {
        /* Lowest set bit = first matching byte in this vector;
         * end is the offset one past that byte. */
        size_t end = i + (size_t)__builtin_ctz((unsigned int)cmp) + 1;

        /* Now we need to write a partial vector.
         * Copying byte by byte costs a cache access per byte, so
         * prefer a single vector that ends exactly at the match.
         * That is only possible if at least 16 bytes lie in front
         * of end; otherwise the store would start before dst. */
        if (end >= 16) {
          _mm_storeu_si128((__m128i *)(a + end - 16),
                           _mm_loadu_si128((const __m128i *)(b + end - 16)));
          return a + end;
        }

        /* Too close to the start of the buffer: copy the remaining
         * bytes up to and including the match one at a time. */
        for (; i < end; ++i)
          a[i] = b[i];
        return a + end;
      }

      _mm_storeu_si128((__m128i *)(a + i), tmp);
      i += step;
      step = 16;
    }
    return NULL;
  }

  /* Short buffers: plain byte loop.  Compare as unsigned char so that
   * bytes >= 0x80 match regardless of the signedness of plain char. */
  for (i = 0; i < count; ++i) {
    char ch = b[i];
    a[i] = ch;
    if ((unsigned char)ch == needle)
      return a + i + 1;
  }
  return NULL;
}

LinuxTV legacy CVS <linuxtv.org/cvs>