File:  [DVB] / dietlibc / lib / c16rtomb.c
Revision 1.1: download - view: text, annotated - select for diffs
Thu Oct 3 15:39:15 2019 UTC (4 years, 8 months ago) by leitner
Branches: MAIN
CVS tags: HEAD
  add stdalign.h
  add uchar.h and c16rtomb, c32rtomb, mbrtoc16, mbrtoc32
  add static_assert to <assert.h>

#include <uchar.h>
#include <errno.h>

/* convert one utf-16 char to utf-8 */
/* if it is a surrogate pair, we just eat the char, update our state and
 * don't output anything */

/* state (in ps) can be NULL, in which case we use a static var */

/* s can be NULL, in which case we don't write anything, but pretend to
 * write 0 to an imaginary buffer, we reset the state, and return how
 * many bytes we would have written */

/* we ignore the locale and always do UTF-8 */

size_t c16rtomb(char *restrict s, char16_t wc, mbstate_t *restrict ps) {
  /* handle ps==NULL by switching to an internal state */
  static mbstate_t internal;
  char32_t w;
  if (!ps) ps=&internal;

  /* utf-16 uses "surrogate pairs" to encode sequences that do not fit
   * into 16 bits. You start by substracting 0x10000, leaving 20 bits.
   * Then you write two pairs:
   *   (0xd800 + the first 10 bits, 0xdc00 + the second 10 bits)
   */

  /* is it the first half of a surrogate pair? */
  if (wc >= 0xd800 && wc <= 0xdbff) {
    /* if we already have a surrogate pair in our state, this is
     * invalid. */
    if (ps->count) {
      errno=EILSEQ;
      return -1;
    }
    ps->count = 10;
    ps->sofar = (wc & 0x3ff);	// extract lower 10 bits
    return 0;
  }

  /* is it the second half of a surrogate pair? */
  if (wc >= 0xdc00 && wc <= 0xdfff) {
    /* if we did not already have a surrogate pair in our state, this is
     * invalid. */
    if (ps->count != 10) {
      errno=EILSEQ;
      return -1;
    }
    /* take the lower 10 bits from this surrogate pair, and the upper 10
     * bits from the previous surrogate pair in the state, then add the
     * implicit 0x10000 lower bounds for surrogate-pair encoded values */
    w = (wc & 0x3ff) + (ps->sofar << 10) + 0x10000;
    ps->count = 0;

    /* the UTF-8 encoding for this range is fixed: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (s) {
      s[0]=0xf0 | ((w >> 18) & 7);	// three bits
      s[1]=0x80 | ((w >> 12) & 0x3f);	// six bits
      s[2]=0x80 | ((w >> 6) & 0x3f);	// next six bits
      s[3]=0x80 | (w & 0x3f);		// last six bits
    }
    return 4;
  }

  /* if we were waiting for the second half of a surrogate pair, this
   * is invalid. */
  if (ps->count) {
    errno=EILSEQ;
    return -1;
  }
  w = wc;	/* just take wc as a code point */

  if (w >= 0x800) {
    /* case 3: 0x800 - 0xffff: 1110xxxx 10xxxxxx 10xxxxxx */
    if (s) {
      s[0]=0xe0 | ((w >> 12) & 0xf);	// four bits
      s[1]=0x80 | ((w >> 6) & 0x3f);	// six bits
      s[2]=0x80 | (w & 0x3f);		// last six bits
    }
    return 3;
  }
  if (w >= 0x80) {
    /* case 2: 0x80 - 0x7ff: 110xxxxx 10xxxxxx */
    if (s) {
      s[0]=0xc0 | ((w >> 6) & 0x1f);	// five bits
      s[1]=0x80 | (w & 0x3f);		// last six bits
    }
    return 2;
  }
  /* case 1: 0-0x7f: 0xxxxxxx */
  if (s)
    s[0]=w;
  return 1;
}

#ifdef UNITTEST
#include <limits.h>
#include <assert.h>
#include <string.h>

int main() {
  char buf[MB_LEN_MAX];
  mbstate_t st = { 0 };
  /* first some positive tests */
  assert(c16rtomb(buf, 0x7a, &st) == 1 && !memcmp(buf,"\x7a",1));
  assert(c16rtomb(buf, 0xdf, &st) == 2 && !memcmp(buf,"\xc3\x9f",2));
  assert(c16rtomb(buf, 0x6c34, &st) == 3 && !memcmp(buf,"\xe6\xb0\xb4",3));
  assert(c16rtomb(buf, 0xd83c, &st) == 0);
  assert(c16rtomb(buf, 0xdf4c, &st) == 4 && !memcmp(buf,"\xf0\x9f\x8d\x8c",4));

  /* now check error cases */
  assert(c16rtomb(buf, 0xdf4c, &st) == -1 && errno==EILSEQ);	// second half of pair without first half seen

  memset(&st,0,sizeof(st));
  assert(c16rtomb(buf, 0xd83c, &st) == 0);
  assert(c16rtomb(buf, 0xd83c, &st) == -1 && errno==EILSEQ);	// first half twice

  memset(&st,0,sizeof(st));
  assert(c16rtomb(buf, 0xd83c, &st) == 0);
  assert(c16rtomb(buf, 'x', &st) == -1 && errno==EILSEQ);	// second half of pair missing
}
#endif

LinuxTV legacy CVS <linuxtv.org/cvs>