/* sml3_utf8.c: UTF8 Funktionen */

/* Copyright 2025 Kurt Nienhaus
 *
 * This file is part of libsammel3.
 * libsammel3 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 * libsammel3 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with libsammel3.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <locale.h>
#include <langinfo.h>
#include <wchar.h>
#include <wctype.h>
#include "config.h"
#include "sml3_fehler.h"
#include "sml3_util.h"
#include "sml3_utf8.h"

/* Needle converted to wide characters.
 * Filled by ndl_set(), queried by ndl_check(), released by ndl_free().
 */
struct ndl_wc {
  int ndlsize, ndlmax, ndlpos;  /* allocated elements of e, used elements of e, element currently compared */
  struct {
    /* NDL_NONE = literal character stored in wc; the others are the
     * classes %a, %A, %b, %d and %p recognized by ndl_set() */
    enum { NDL_NONE = 0, NDL_AL, NDL_AH, NDL_B, NDL_D, NDL_P } type;
    wchar_t wc;  /* wide character of this element (lowercased by ndl_set() if requested) */
  } *e;
};

/* locale category that is switched to UTF-8 and restored afterwards */
static const int category = LC_CTYPE;

/* file-local helpers */
static char * utf8_setlocale(void);
static void utf8_restorelocale(char *);
static int utf8_to_wc(wchar_t *, const char *, size_t);
static int wc_to_utf8(wchar_t, char *, int *);
static int ndl_set(struct ndl_wc *, const char *, size_t, int, int);
static int ndl_check(struct ndl_wc *, wchar_t, int);
static void ndl_free(struct ndl_wc *);
static size_t utf8_strnlen(const char *, size_t *);
static char * utf8_strchr(const char *, int, int);
static char * utf8_strrchr(const char *, int, int);
static char * utf8_str_pbrk_spn_cspn(const char *, const char *, int, int);
static char * utf8_strstr(const char *, const char *, int);
static int utf8_strncmp(const char *, const char *, size_t, int);

/* functions with external linkage defined in this file */
int SML3_utf8_codierung(const char *);
int SML3_utf8_next(const char *, size_t, int *);
int SML3_utf8_from_codepoint(int, char *);
int SML3_utf8_gettyp(int, struct SML3_utf8_typ *);
int SML3_utf8_toupper(int);
int SML3_utf8_tolower(int);
size_t SML3_utf8_strlen(const char *);
size_t SML3_utf8_strnlen(const char *, size_t *);
char * SML3_utf8_strchr(const char *, int);
char * SML3_utf8_strcasechr(const char *, int);
char * SML3_utf8_strrchr(const char *, int);
char * SML3_utf8_strcaserchr(const char *, int);
char * SML3_utf8_strpbrk(const char *, const char *);
char * SML3_utf8_strcasepbrk(const char *, const char *);
size_t SML3_utf8_strspn(const char *, const char *);
size_t SML3_utf8_strcasespn(const char *, const char *);
size_t SML3_utf8_strcspn(const char *, const char *);
size_t SML3_utf8_strcasecspn(const char *, const char *);
char * SML3_utf8_strstr(const char *, const char *);
char * SML3_utf8_strcasestr(const char *, const char *);
int SML3_utf8_strcmp(const char *, const char *);
int SML3_utf8_strcasecmp(const char *, const char *);
int SML3_utf8_strncmp(const char *, const char *, size_t);
int SML3_utf8_strncasecmp(const char *, const char *, size_t);


/* Switches the LC_CTYPE locale to one that uses UTF-8.
 * First the locale from the environment is tried; if its codeset is not
 * UTF-8, "<name>.UTF-8" derived from it and then a few well-known UTF-8
 * locales are tried.
 * Return: heap copy of the previously active locale name, to be handed to
 *         utf8_restorelocale(), or NULL on error (error is set with
 *         SML3_fehlernew(); the previous locale is active again).
 */
static char *
utf8_setlocale(void)
{
  char *origlocale;
  char *codeset, *oloc;

  origlocale = setlocale(category, NULL);
  if (origlocale == NULL) { SML3_fehlernew(EINVAL, "setlocale returns NULL"); return NULL; }
  /* copy it: the buffer returned by setlocale() may be overwritten by the next call */
  origlocale = SML3_strdup(origlocale);

  oloc = setlocale(category, "");
  codeset = nl_langinfo(CODESET);
  if (strcmp(codeset, "UTF-8") != 0) {
    if (oloc != NULL) {
      /* keep language/territory of the environment locale, replace the codeset */
      char locbuf[32];
      size_t olen = strcspn(oloc, ".");
      snprintf(locbuf, sizeof(locbuf), "%.*s.UTF-8", (int)olen, oloc);
      oloc = setlocale(category, locbuf);
    } else {
      oloc = NULL;
    }
    if (oloc == NULL) { oloc = setlocale(category, "C.UTF-8"); }
    if (oloc == NULL) { oloc = setlocale(category, "de_DE.UTF-8"); }
    if (oloc == NULL) { oloc = setlocale(category, "en_US.UTF-8"); }
    if (oloc == NULL) {
      /* no UTF-8 locale available: undo setlocale(category, "") and
       * release the copy, the caller only sees NULL and cannot do it */
      if (origlocale != NULL) {
        setlocale(category, origlocale);
        free(origlocale);
      }
      SML3_fehlernew(EINVAL, "UTF-8 is not supported");
      return NULL;
    }
  }

  return origlocale;
} /* end of utf8_setlocale */


/* Re-activates the locale saved by utf8_setlocale() and frees the saved name.
 * A NULL argument is ignored.
 */
static void
utf8_restorelocale(char *origlocale)
{
  if (origlocale != NULL) {
    setlocale(category, origlocale);
    free(origlocale);
  }
} /* end of utf8_restorelocale */


/* Converts one multibyte character to wchar_t using the current locale.
 * wc:   receives the wide character
 * cbuf: bytes of the character
 * clen: number of bytes available in cbuf
 * Return: 1 = converted, 0 = missing argument, NUL character,
 *         or invalid/incomplete sequence (the latter is reported on stderr)
 */
static int
utf8_to_wc(wchar_t *wc, const char *cbuf, size_t clen)
{
  mbstate_t state;
  size_t res;

  if (wc == NULL || cbuf == NULL || clen == 0) { return 0; }

  /* always start from the initial conversion state */
  memset(&state, 0, sizeof(state));
  res = mbrtowc(wc, cbuf, clen, &state);

  /* (size_t)-1 = invalid sequence, (size_t)-2 = incomplete sequence */
  if (res == (size_t)-1 || res == (size_t)-2) {
    fprintf(stderr, "mbrtowc(%.*s): %s\n", (int)clen, cbuf, strerror(errno));
    return 0;
  }

  /* 0 means the NUL character was converted */
  return (res == 0) ? 0 : 1;
} /* end of utf8_to_wc */


/* Converts a wchar_t to its multibyte form using the current locale.
 * wc:     wide character
 * cbuf:   receives the bytes (not NUL-terminated by this function)
 * codept: if not NULL, is first set to 0 and then receives the code point
 *         decoded from the produced bytes
 * Return: 1 = OK, 0 = cbuf is NULL, conversion failed, or decoding failed
 */
static int
wc_to_utf8(wchar_t wc, char *cbuf, int *codept)
{
  mbstate_t state;
  size_t nbytes;

  if (cbuf == NULL) { return 0; }
  if (codept != NULL) { *codept = 0; }

  /* always start from the initial conversion state */
  memset(&state, 0, sizeof(state));
  nbytes = wcrtomb(cbuf, wc, &state);
  if (nbytes == (size_t)-1) { return 0; }

  /* nothing to decode */
  if (codept == NULL || nbytes == 0) { return 1; }

  return (SML3_utf8_next(cbuf, nbytes, codept) > 0) ? 1 : 0;
} /* end of wc_to_utf8 */


/* setzt ndlwc aus needle, Rueckgabe: ob erfolgreich */
static int
ndl_set(struct ndl_wc *ndlwc, const char *needle, size_t nlen, int icase, int withformat)
{
  const int ndlplus = 16;
  int clen;
  wchar_t wc;

  if (ndlwc == NULL) { return 0; }
  memset(ndlwc, 0, sizeof(*ndlwc));
  if (needle == NULL || nlen == 0) { return 1; }

  for (;;) {
    clen = SML3_utf8_next(needle, nlen, NULL);
    if (clen == 0) { break; }
    if (clen < 0) {
      fprintf(stderr, "SML3_utf8_next(): invalid utf8-character: %.*s\n", (int)(nlen > 8 ? 8 : nlen), needle);
      if (ndlwc->ndlsize > 0 && ndlwc->e != NULL) { free(ndlwc->e); }
      memset(ndlwc, 0, sizeof(*ndlwc));
      return 0;
    }
    if (!utf8_to_wc(&wc, needle, clen)) {
      if (ndlwc->ndlsize > 0 && ndlwc->e != NULL) { free(ndlwc->e); }
      memset(ndlwc, 0, sizeof(*ndlwc));
      return 0;
    }
    if (icase) { wc = towlower(wc); }

    if (ndlwc->ndlsize == 0) {
      ndlwc->e = SML3_malloc(sizeof(*ndlwc->e) * ndlplus);
      ndlwc->ndlmax = 1;
    } else {
      ndlwc->e = SML3_realloc(ndlwc->e, sizeof(*ndlwc->e) * (ndlwc->ndlsize + ndlplus));
      ndlwc->ndlmax++;
    }
    ndlwc->ndlsize += ndlplus;

    ndlwc->e[ndlwc->ndlmax - 1].type = NDL_NONE;
    ndlwc->e[ndlwc->ndlmax - 1].wc = wc;

    if (withformat && clen == 1 && needle[0] == '%') {
      if (needle[1] == 'a') {
        ndlwc->e[ndlwc->ndlmax - 1].type = NDL_AL;
        clen++;
      } else if (needle[1] == 'A') {
        ndlwc->e[ndlwc->ndlmax - 1].type = NDL_AH;
        clen++;
      } else if (needle[1] == 'b') {
        ndlwc->e[ndlwc->ndlmax - 1].type = NDL_B;
        clen++;
      } else if (needle[1] == 'd') {
        ndlwc->e[ndlwc->ndlmax - 1].type = NDL_D;
        clen++;
      } else if (needle[1] == 'p') {
        ndlwc->e[ndlwc->ndlmax - 1].type = NDL_P;
        clen++;
      } else if (needle[1] == '%') {
        clen++;
      }
    }

    needle += clen;
    nlen -= clen;
  }

  return 1;
} /* Ende ndl_set */


/* Compares wc with the needle element at ndlwc->ndlpos.
 * ndlwc: needle set up by ndl_set(); ndlpos selects the element
 * wc:    character to test
 * icase: if set, %a and %A accept any letter regardless of case
 * Return: 1 = match, 0 = no match (also if ndlwc is NULL or empty)
 */
static int
ndl_check(struct ndl_wc *ndlwc, wchar_t wc, int icase)
{
  if (ndlwc == NULL) { return 0; }
  if (ndlwc->ndlsize == 0 && ndlwc->e == NULL) { return 0; }

  switch (ndlwc->e[ndlwc->ndlpos].type) {
    case NDL_NONE:  /* literal character */
      return (wc == ndlwc->e[ndlwc->ndlpos].wc) ? 1 : 0;
    case NDL_AL:    /* %a */
      return (iswalpha(wc) && (icase || iswlower(wc))) ? 1 : 0;
    case NDL_AH:    /* %A */
      return (iswalpha(wc) && (icase || iswupper(wc))) ? 1 : 0;
    case NDL_B:     /* %b */
      return iswblank(wc) ? 1 : 0;
    case NDL_D:     /* %d */
      return iswdigit(wc) ? 1 : 0;
    case NDL_P:     /* %p */
      return iswpunct(wc) ? 1 : 0;
    default:
      break;
  }

  return 0;
} /* end of ndl_check */


/* Releases the element array of ndlwc and clears the structure.
 * A NULL argument is ignored.
 */
static void
ndl_free(struct ndl_wc *ndlwc)
{
  if (ndlwc != NULL) {
    if (ndlwc->e != NULL && ndlwc->ndlsize > 0) {
      free(ndlwc->e);
    }
    memset(ndlwc, 0, sizeof(*ndlwc));
  }
} /* end of ndl_free */


/* SML3_utf8_codierung:
 * tells whether the string is plain ASCII, valid UTF-8 with multibyte
 * characters, or not valid UTF-8 (some 8-bit encoding)
 * 1.arg: string (NULL is treated like an empty string)
 * Return: SML3_UTF8_CODE_ASCII, SML3_UTF8_CODE_UTF8 or SML3_UTF8_CODE_8BYTE
 */
int
SML3_utf8_codierung(const char *string)
{
  const char *tptr;
  size_t tsize;
  int tbyt, iscode;

  /* strlen(NULL) would crash; an absent string contains no non-ASCII byte */
  if (string == NULL) { return SML3_UTF8_CODE_ASCII; }

  tptr = string;
  tsize = strlen(string);
  iscode = SML3_UTF8_CODE_ASCII;

  for (;;) {
    tbyt = SML3_utf8_next(tptr, tsize, NULL);
    if (tbyt <= 0) { break; }
    if (tbyt > 1) { iscode = SML3_UTF8_CODE_UTF8; }
    tptr += tbyt;
    tsize -= (size_t)tbyt;
  }

  /* stopped at an invalid sequence instead of the end of the string */
  if (tbyt < 0) { iscode = SML3_UTF8_CODE_8BYTE; }

  return iscode;
} /* end of SML3_utf8_codierung */


/* SML3_utf8_next:
 * get the next UTF-8 character
 * 1.arg: string
 * 2.arg: number of bytes available in the string
 * 3.arg: for returning the Unicode code point of the character, if not NULL
 *        (only written when a character is returned)
 * Return: number of bytes of the UTF-8 character
 *         or 0 = end (string is NULL or no bytes left)
 *         or -1 = invalid character: bad lead or continuation byte,
 *                 sequence cut off by the given length, or overlong encoding
 *
 * Sequences of up to 6 bytes (code points up to 0x7fffffff) are accepted.
 * A NUL byte inside the given length is returned as a character of 1 byte.
 *
 * Example:
 *   const char *string = "Word";
 *   const char *strptr;
 *   size_t strsize;
 *   int no_bytes, codept;
 *
 *   strptr = string;
 *   strsize = strlen(strptr);
 *   for (;;) {
 *     no_bytes = SML3_utf8_next(strptr, strsize, &codept);
 *     if (no_bytes <= 0) { break; }
 *     printf("UTF8-Zeichen: codept = %d, Anzahl Bytes = %d: %.*s\n", codept, no_bytes, no_bytes, strptr);
 *     strptr += no_bytes;
 *     strsize -= no_bytes;
 *   }
 */
int
SML3_utf8_next(const char *string, size_t size, int *codept)
{
  const unsigned char *uc;
  int anzz, i1, cp, mincp;

  if (string == NULL || size == 0) { return 0; }

  uc = (const unsigned char *)string;
  if (uc[0] < 0x80) {
    if (codept != NULL) { *codept = (int)uc[0]; }
    return 1;
  }

  /* the lead byte gives the length, the payload bits of the lead byte and
   * the smallest code point that needs this length;
   * 0x80-0xbf are continuation bytes, 0xc0/0xc1 would be overlong */
  if (uc[0] < 0xc2) {
    return -1;
  } else if (uc[0] < 0xe0) {
    anzz = 2; cp = uc[0] & 0x1f; mincp = 0x80;
  } else if (uc[0] < 0xf0) {
    anzz = 3; cp = uc[0] & 0x0f; mincp = 0x800;
  } else if (uc[0] < 0xf8) {
    anzz = 4; cp = uc[0] & 0x07; mincp = 0x10000;
  } else if (uc[0] < 0xfc) {
    anzz = 5; cp = uc[0] & 0x03; mincp = 0x200000;
  } else if (uc[0] < 0xfe) {
    anzz = 6; cp = uc[0] & 0x01; mincp = 0x4000000;
  } else {
    return -1;
  }

  if (size < (size_t)anzz) { return -1; }

  /* every following byte must be 10xxxxxx and contributes 6 bits */
  for (i1 = 1; i1 < anzz; i1++) {
    if (uc[i1] < 0x80 || uc[i1] > 0xbf) { return -1; }
    cp = (cp << 6) | (uc[i1] & 0x3f);
  }

  /* reject overlong encodings also for 3 to 6 bytes, as is already done
   * for 2 bytes by refusing the lead bytes 0xc0 and 0xc1 */
  if (cp < mincp) { return -1; }

  if (codept != NULL) { *codept = cp; }
  return anzz;
} /* end of SML3_utf8_next */


/* SML3_utf8_from_codepoint:
 * builds the UTF-8 character of a Unicode code point
 * 1.arg: Unicode code point
 * 2.arg: for returning the UTF-8 character, if not NULL;
 *        should have room for at least 6 bytes, no NUL byte is appended
 * Return: number of bytes of the UTF-8 character, or 0 = invalid (negative)
 *
 * Example:
 *   char cbuf[8];
 *   int no_bytes, codept;
 *
 *   SML3_utf8_next("X", sizeof("X"), &codept);  // get Unicode code point of X
 *   no_bytes = SML3_utf8_from_codepoint(codept, cbuf);
 *   printf("UTF8-Zeichen: codept = %d, Anzahl Bytes = %d: %.*s\n", codept, no_bytes, no_bytes, cbuf);
 */
int
SML3_utf8_from_codepoint(int codept, char *cbuf)
{
  /* lead byte marker, indexed by the length of the sequence */
  static const unsigned char leadmark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
  unsigned char enc[8];
  unsigned int rest;
  int nbytes, idx;

  if (codept < 0) { return 0; }

  /* length needed for the code point */
  if (codept < 0x80) {
    nbytes = 1;
  } else if (codept < 0x800) {
    nbytes = 2;
  } else if (codept < 0x10000) {
    nbytes = 3;
  } else if (codept < 0x200000) {
    nbytes = 4;
  } else if (codept < 0x4000000) {
    nbytes = 5;
  } else {
    nbytes = 6;
  }

  rest = (unsigned int)codept;
  if (nbytes == 1) {
    enc[0] = (unsigned char)rest;
  } else {
    /* continuation bytes from the last to the second, 6 bits each;
     * the bits left over go into the lead byte */
    for (idx = nbytes - 1; idx > 0; idx--) {
      enc[idx] = (unsigned char)((rest & 0x3f) | 0x80);
      rest >>= 6;
    }
    enc[0] = (unsigned char)(rest | leadmark[nbytes]);
  }

  if (cbuf != NULL) {
    for (idx = 0; idx < nbytes; idx++) {
      cbuf[idx] = (char)enc[idx];
    }
  }

  return nbytes;
} /* end of SML3_utf8_from_codepoint */


/* SML3_utf8_gettyp:
 * get the character class properties of a UTF-8 character
 * 1.arg: Unicode code point
 * 2.arg: for returning the properties; is cleared first, may be NULL
 * Return: 1 = OK or 0 = invalid character (or the terminating 0),
 *         also 0 if no UTF-8 locale could be set
 */
int
SML3_utf8_gettyp(int codept, struct SML3_utf8_typ *utype)
{
  char *savedloc;
  char enc[8];
  int nbytes, ok;
  wchar_t wc;

  if (utype != NULL) { memset(utype, 0, sizeof(*utype)); }

  /* the isw*() tests below need a UTF-8 locale */
  savedloc = utf8_setlocale();
  if (savedloc == NULL) { fprintf(stderr, "SML3_utf8_gettyp(): %s\n", SML3_fehlermsg()); return 0; }

  /* code point -> UTF-8 bytes -> wchar_t */
  nbytes = SML3_utf8_from_codepoint(codept, enc);
  ok = utf8_to_wc(&wc, enc, nbytes) ? 1 : 0;

  if (ok && utype != NULL) {
    if (iswalpha(wc)) { utype->is_alpha = 1; }
    if (iswblank(wc)) { utype->is_blank = 1; }
    if (iswcntrl(wc)) { utype->is_cntrl = 1; }
    if (iswdigit(wc)) { utype->is_digit = 1; }
    if (iswgraph(wc)) { utype->is_graph = 1; }
    if (iswlower(wc)) { utype->is_lower = 1; }
    if (iswprint(wc)) { utype->is_print = 1; }
    if (iswpunct(wc)) { utype->is_punct = 1; }
    if (iswspace(wc)) { utype->is_space = 1; }
    if (iswupper(wc)) { utype->is_upper = 1; }
    if (iswxdigit(wc)) { utype->is_xdigit = 1; }
  }

  utf8_restorelocale(savedloc);
  return ok;
} /* end of SML3_utf8_gettyp */


/* SML3_utf8_toupper:
 * convert a UTF-8 character to upper case
 * 1.arg: Unicode code point
 * Return: 1.arg in upper case, or 1.arg unchanged if it has no upper case
 *         form or if any conversion step fails
 */
int
SML3_utf8_toupper(int codept)
{
  char *origlocale;
  char cbuf[8];
  int clen, retcodept, neucodept;
  wchar_t wc;

  /* set UTF-8 locale */
  origlocale = utf8_setlocale();
  if (origlocale == NULL) { return codept; }

  retcodept = codept;

  /* get wchar_t from the Unicode code point */
  clen = SML3_utf8_from_codepoint(codept, cbuf);
  if (utf8_to_wc(&wc, cbuf, clen)) {
    wc = towupper(wc);
    /* get the Unicode code point from wchar_t; a separate variable is used
     * because wc_to_utf8() zeroes its output before it can fail */
    if (wc_to_utf8(wc, cbuf, &neucodept)) { retcodept = neucodept; }
  }

  utf8_restorelocale(origlocale);
  return retcodept;
} /* end of SML3_utf8_toupper */


/* SML3_utf8_tolower:
 * convert a UTF-8 character to lower case
 * 1.arg: Unicode code point
 * Return: 1.arg in lower case, or 1.arg unchanged if it has no lower case
 *         form or if any conversion step fails
 */
int
SML3_utf8_tolower(int codept)
{
  char *origlocale;
  char cbuf[8];
  int clen, retcodept, neucodept;
  wchar_t wc;

  /* set UTF-8 locale */
  origlocale = utf8_setlocale();
  if (origlocale == NULL) { return codept; }

  retcodept = codept;

  /* get wchar_t from the Unicode code point */
  clen = SML3_utf8_from_codepoint(codept, cbuf);
  if (utf8_to_wc(&wc, cbuf, clen)) {
    wc = towlower(wc);
    /* get the Unicode code point from wchar_t; a separate variable is used
     * because wc_to_utf8() zeroes its output before it can fail */
    if (wc_to_utf8(wc, cbuf, &neucodept)) { retcodept = neucodept; }
  }

  utf8_restorelocale(origlocale);
  return retcodept;
} /* end of SML3_utf8_tolower */


/* SML3_utf8_strlen:
 * return the number of UTF-8 characters in a UTF-8 string
 * 1.arg: NUL-terminated string
 * Return: number of UTF-8 characters (counting is done by utf8_strnlen()
 *         over strlen() bytes)
 */
size_t
SML3_utf8_strlen(const char *string)
{
  return utf8_strnlen(string, NULL);
} /* end of SML3_utf8_strlen */


/* SML3_utf8_strnlen:
 * return the number of UTF-8 characters in a UTF-8 string
 * 1.arg: string
 * 2.arg: in:  maximum number of bytes of 1.arg to evaluate
 *        out: number of bytes of 1.arg that hold complete UTF-8 characters
 * Return: number of UTF-8 characters
 */
size_t
SML3_utf8_strnlen(const char *string, size_t *nmax)
{
  return utf8_strnlen(string, nmax);
} /* end of SML3_utf8_strnlen */


/* Counts the UTF-8 characters in a UTF-8 string.
 * string: string; NULL counts as empty
 * nmax:   NULL = evaluate strlen(string) bytes, otherwise
 *         in:  maximum number of bytes to evaluate
 *         out: number of bytes that hold the counted characters
 * Counting stops at the end of the byte limit, at a NUL byte, or at the
 * first invalid or cut-off character.
 * Return: number of UTF-8 characters
 */
static size_t
utf8_strnlen(const char *string, size_t *nmax)
{
  int clen;
  size_t slen, zanz;

  if (string == NULL) {
    /* no string, so no bytes were evaluated */
    if (nmax != NULL) { *nmax = 0; }
    return 0;
  }
  if (nmax != NULL) {
    slen = *nmax;
  } else {
    slen = strlen(string);
  }

  /* count the UTF-8 characters; the test for the NUL byte keeps the scan
   * inside the string when *nmax is larger than the string */
  zanz = 0;
  while (slen > 0 && *string != '\0') {
    clen = SML3_utf8_next(string, slen, NULL);
    if (clen <= 0) { break; }
    string += clen;
    slen -= (size_t)clen;
    zanz++;
  }
  /* slen is what was left unevaluated */
  if (nmax != NULL) { *nmax -= slen; }

  return zanz;
} /* end of utf8_strnlen */


/* SML3_utf8_strchr:
 * search a UTF-8 character in a UTF-8 string
 * 1.arg: string
 * 2.arg: Unicode code point of the UTF-8 character
 * Return: pointer to the UTF-8 character in the string, or NULL
 */
char *
SML3_utf8_strchr(const char *string, int codept)
{
  return utf8_strchr(string, codept, 0);
} /* end of SML3_utf8_strchr */


/* SML3_utf8_strcasechr:
 * search a UTF-8 character in a UTF-8 string, ignoring case
 * 1.arg: string
 * 2.arg: Unicode code point of the UTF-8 character
 * Return: pointer to the UTF-8 character in the string, or NULL
 */
char *
SML3_utf8_strcasechr(const char *string, int codept)
{
  return utf8_strchr(string, codept, 1);
} /* end of SML3_utf8_strcasechr */


/* Searches the first occurrence of a UTF-8 character in a UTF-8 string.
 * string: string to search in
 * codept: Unicode code point to search for; 0 yields the terminating NUL
 * icase:  if set, both sides are lowercased before comparing
 * Return: pointer to the character in string, or NULL if it is not found,
 *         string is NULL, the locale cannot be set, or an invalid
 *         character is hit before a match
 */
static char *
utf8_strchr(const char *string, int codept, int icase)
{
  char *savedloc;
  char enc[8];
  const char *cur, *hit;
  size_t rest;
  int nbytes;
  wchar_t want, got;

  if (string == NULL) { return NULL; }
  rest = strlen(string);
  if (codept == 0) { return (char *)(string + rest); }
  if (rest == 0) { return NULL; }

  /* the wide character functions need a UTF-8 locale */
  savedloc = utf8_setlocale();
  if (savedloc == NULL) { fprintf(stderr, "utf8_strchr(): %s\n", SML3_fehlermsg()); return NULL; }

  /* code point -> wchar_t, lowercased if requested */
  nbytes = SML3_utf8_from_codepoint(codept, enc);
  if (!utf8_to_wc(&want, enc, nbytes)) { utf8_restorelocale(savedloc); return NULL; }
  if (icase) { want = towlower(want); }

  /* walk through the string character by character */
  hit = NULL;
  for (cur = string;; cur += nbytes, rest -= nbytes) {
    nbytes = SML3_utf8_next(cur, rest, NULL);
    if (nbytes <= 0) { break; }
    if (!utf8_to_wc(&got, cur, nbytes)) { break; }
    if (icase) { got = towlower(got); }
    if (got == want) { hit = cur; break; }
  }

  utf8_restorelocale(savedloc);
  return (char *)hit;
} /* end of utf8_strchr */


/* SML3_utf8_strrchr:
 * search the last occurrence of a UTF-8 character in a UTF-8 string
 * 1.arg: string
 * 2.arg: Unicode code point of the UTF-8 character
 * Return: pointer to the UTF-8 character in the string, or NULL
 */
char *
SML3_utf8_strrchr(const char *string, int codept)
{
  return utf8_strrchr(string, codept, 0);
} /* end of SML3_utf8_strrchr */


/* SML3_utf8_strcaserchr:
 * search the last occurrence of a UTF-8 character in a UTF-8 string,
 * ignoring case
 * 1.arg: string
 * 2.arg: Unicode code point of the UTF-8 character
 * Return: pointer to the UTF-8 character in the string, or NULL
 */
char *
SML3_utf8_strcaserchr(const char *string, int codept)
{
  return utf8_strrchr(string, codept, 1);
} /* end of SML3_utf8_strcaserchr */


/* Searches the last occurrence of a UTF-8 character in a UTF-8 string.
 * string: string to search in
 * codept: Unicode code point to search for; 0 yields the terminating NUL
 * icase:  if set, both sides are lowercased before comparing
 * Return: pointer to the character in string, or NULL if it is not found,
 *         string is NULL, the locale cannot be set, or an invalid
 *         character is hit before a match
 */
static char *
utf8_strrchr(const char *string, int codept, int icase)
{
  char *origlocale;
  char cbuf[8];
  int clen;
  wchar_t wch, wcn;
  size_t slen, pos, blen;
  const char *cur, *found;

  if (string == NULL) { return NULL; }
  slen = strlen(string);
  if (codept == 0) { return (char *)(string + slen); }
  if (slen == 0) { return NULL; }

  /* set UTF-8 locale */
  origlocale = utf8_setlocale();
  if (origlocale == NULL) { fprintf(stderr, "utf8_strrchr(): %s\n", SML3_fehlermsg()); return NULL; }

  /* get wchar_t from the Unicode code point, lowercase it if requested */
  clen = SML3_utf8_from_codepoint(codept, cbuf);
  if (!utf8_to_wc(&wcn, cbuf, clen)) { utf8_restorelocale(origlocale); return NULL; }
  if (icase) { wcn = towlower(wcn); }

  /* Search backwards. An index is used instead of a pointer so that no
   * pointer before the start of the string is ever formed.
   * blen = bytes from the current position up to the start of the
   * character that was examined last (or up to the end of the string). */
  found = NULL;
  blen = 0;
  pos = slen;
  while (pos > 0) {
    pos--;
    blen++;
    cur = string + pos;
    /* continuation byte: the lead byte is further to the left */
    if ((unsigned char)*cur >= 0x80 && (unsigned char)*cur <= 0xbf) { continue; }
    clen = SML3_utf8_next(cur, blen, NULL);
    if (clen <= 0) { break; }
    if (!utf8_to_wc(&wch, cur, clen)) { break; }
    if (icase) { wch = towlower(wch); }
    if (wch == wcn) { found = cur; break; }
    blen = 0;
  }

  utf8_restorelocale(origlocale);
  return (char *)found;
} /* end of utf8_strrchr */


/* SML3_utf8_strpbrk:
 * search a UTF-8 string for any character of a set of UTF-8 characters
 * 1.arg: string
 * 2.arg: set of UTF-8 characters
 *        or %a = lower case letter
 *           %A = upper case letter
 *           %b = blank (tested with iswblank)
 *           %d = digit
 *           %p = punctuation character (tested with iswpunct)
 *           %% = percent sign
 * Return: pointer to the character found in the string, or NULL
 */
char *
SML3_utf8_strpbrk(const char *string, const char *accept)
{
  char *retp = utf8_str_pbrk_spn_cspn(string, accept, 0, 1);
  /* the helper reports "nothing found" as a pointer to the terminating NUL */
  if (retp != NULL && *retp == '\0') { retp = NULL; }
  return retp;
} /* end of SML3_utf8_strpbrk */


/* SML3_utf8_strcasepbrk:
 * search a UTF-8 string for any character of a set of UTF-8 characters,
 * ignoring case
 * 1.arg: string
 * 2.arg: set of UTF-8 characters
 *        or %a = letter
 *           %A = letter
 *           %b = blank (tested with iswblank)
 *           %d = digit
 *           %p = punctuation character (tested with iswpunct)
 *           %% = percent sign
 * Return: pointer to the character found in the string, or NULL
 */
char *
SML3_utf8_strcasepbrk(const char *string, const char *accept)
{
  char *retp = utf8_str_pbrk_spn_cspn(string, accept, 1, 1);
  /* the helper reports "nothing found" as a pointer to the terminating NUL */
  if (retp != NULL && *retp == '\0') { retp = NULL; }
  return retp;
} /* end of SML3_utf8_strcasepbrk */


/* SML3_utf8_strspn:
 * like strspn for UTF-8
 * 1.arg: string
 * 2.arg: set of UTF-8 characters
 *        or %a = lower case letter
 *           %A = upper case letter
 *           %b = blank (tested with iswblank)
 *           %d = digit
 *           %p = punctuation character (tested with iswpunct)
 *           %% = percent sign
 * Return: number of bytes, i.e. the offset of the position reported by
 *         utf8_str_pbrk_spn_cspn(), or 0 if that reports NULL
 */
size_t
SML3_utf8_strspn(const char *string, const char *accept)
{
  const char *nstr = utf8_str_pbrk_spn_cspn(string, accept, 0, 0);
  if (nstr == NULL) { return 0; }
  return (size_t)(nstr - string);
} /* end of SML3_utf8_strspn */


/* SML3_utf8_strcasespn:
 * like strspn for UTF-8, but ignoring case
 * 1.arg: string
 * 2.arg: set of UTF-8 characters
 *        or %a = letter
 *           %A = letter
 *           %b = blank (tested with iswblank)
 *           %d = digit
 *           %p = punctuation character (tested with iswpunct)
 *           %% = percent sign
 * Return: number of bytes, i.e. the offset of the position reported by
 *         utf8_str_pbrk_spn_cspn(), or 0 if that reports NULL
 */
size_t
SML3_utf8_strcasespn(const char *string, const char *accept)
{
  const char *nstr = utf8_str_pbrk_spn_cspn(string, accept, 1, 0);
  if (nstr == NULL) { return 0; }
  return (size_t)(nstr - string);
} /* end of SML3_utf8_strcasespn */


/* SML3_utf8_strcspn:
 * like strcspn for UTF-8
 * 1.arg: string
 * 2.arg: set of UTF-8 characters
 *        or %a = lower case letter
 *           %A = upper case letter
 *           %b = blank (tested with iswblank)
 *           %d = digit
 *           %p = punctuation character (tested with iswpunct)
 *           %% = percent sign
 * Return: number of bytes, i.e. the offset of the position reported by
 *         utf8_str_pbrk_spn_cspn(), or 0 if that reports NULL
 */
size_t
SML3_utf8_strcspn(const char *string, const char *reject)
{
  const char *nstr = utf8_str_pbrk_spn_cspn(string, reject, 0, 1);
  if (nstr == NULL) { return 0; }
  return (size_t)(nstr - string);
} /* end of SML3_utf8_strcspn */


/* SML3_utf8_strcasecspn:
 * like strcspn for UTF-8, but ignoring case
 * 1.arg: string
 * 2.arg: set of UTF-8 characters
 *        or %a = letter
 *           %A = letter
 *           %b = blank (tested with iswblank)
 *           %d = digit
 *           %p = punctuation character (tested with iswpunct)
 *           %% = percent sign
 * Return: number of bytes, i.e. the offset of the position reported by
 *         utf8_str_pbrk_spn_cspn(), or 0 if that reports NULL
 */
size_t
SML3_utf8_strcasecspn(const char *string, const char *reject)
{
  const char *nstr = utf8_str_pbrk_spn_cspn(string, reject, 1, 1);
  if (nstr == NULL) { return 0; }
  return (size_t)(nstr - string);
} /* end of SML3_utf8_strcasecspn */


/* Common scanner for the strpbrk, strspn and strcspn variants.
 * string:       string to scan
 * needle:       set of UTF-8 characters, may contain %a %A %b %d %p %%
 * icase:        if set, characters are compared lowercased
 * ret_on_found: 1 = stop at the first character that IS in the set
 *                   (strpbrk, strcspn)
 *               0 = stop at the first character that is NOT in the set
 *                   (strspn)
 * Return: pointer to the position where the scan stopped; this is the
 *         terminating NUL if the whole string was scanned, and the
 *         offending byte if an invalid character was hit;
 *         NULL if an argument is NULL, string is empty, the locale cannot
 *         be set or needle is not valid UTF-8
 * NOTE(review): with ret_on_found an invalid byte in string is reported
 * like a hit, so the strpbrk variants return a pointer to it - confirm
 * whether that is wanted.
 */
static char *
utf8_str_pbrk_spn_cspn(const char *string, const char *needle, int icase, int ret_on_found)
{
  char *origlocale;
  int clen;
  wchar_t wc;
  size_t slen, nlen;
  struct ndl_wc ndlwc;

  if (string == NULL || needle == NULL) { return NULL; }
  slen = strlen(string);
  nlen = strlen(needle);
  if (nlen == 0) {
    /* empty set: no character is in it, so a scan for a member runs to
     * the end and a scan for a non-member stops at once */
    if (ret_on_found) { return (char *)(string + slen); }
    return (char *)string;
  }
  if (slen == 0) { return NULL; }

  /* set UTF-8 locale */
  origlocale = utf8_setlocale();
  if (origlocale == NULL) { fprintf(stderr, "utf8_str_pbrk_spn_cspn(): %s\n", SML3_fehlermsg()); return NULL; }

  /* convert the characters of needle to wchar_t, lowercase them if requested */
  if (!ndl_set(&ndlwc, needle, nlen, icase, 1)) { utf8_restorelocale(origlocale); return NULL; }

  /* scan the string */
  for (;;) {
    clen = SML3_utf8_next(string, slen, NULL);
    /* end of string or invalid character: the scan stops here; the
     * position is kept in both modes so that the spn variants report the
     * bytes accepted so far instead of 0 */
    if (clen <= 0) { break; }
    if (!utf8_to_wc(&wc, string, clen)) { break; }
    if (icase) { wc = towlower(wc); }
    for (ndlwc.ndlpos = 0; ndlwc.ndlpos < ndlwc.ndlmax; ndlwc.ndlpos++) {
      if (ndl_check(&ndlwc, wc, icase)) { break; }
    }
    if (ret_on_found) {
      if (ndlwc.ndlpos < ndlwc.ndlmax) { break; }  /* found */
    } else {
      if (ndlwc.ndlpos == ndlwc.ndlmax) { break; }  /* not found */
    }
    string += clen;
    slen -= (size_t)clen;
  }

  ndl_free(&ndlwc);
  utf8_restorelocale(origlocale);
  return (char *)string;
} /* end of utf8_str_pbrk_spn_cspn */


/* SML3_utf8_strstr:
 * like strstr for UTF-8 (not really needed, strstr() is sufficient)
 * 1.arg: string
 * 2.arg: string to search for
 * Return: pointer to the start of the match or NULL = not found
 */
char *
SML3_utf8_strstr(const char *haystack, const char *needle)
{
  return utf8_strstr(haystack, needle, 0);
} /* end of SML3_utf8_strstr */


/* SML3_utf8_strcasestr:
 * like strstr for UTF-8, but ignoring case
 * 1.arg: string
 * 2.arg: string to search for
 * Return: pointer to the start of the match or NULL = not found
 */
char *
SML3_utf8_strcasestr(const char *haystack, const char *needle)
{
  return utf8_strstr(haystack, needle, 1);
} /* end of SML3_utf8_strcasestr */


/* Searches a UTF-8 string inside a UTF-8 string.
 * string: string to search in
 * needle: string to search for; an empty needle matches at the start
 * icase:  if set, characters are compared lowercased
 * Return: pointer to the start of the first match, or NULL if there is no
 *         match, an argument is NULL, the locale cannot be set, needle is
 *         not valid UTF-8, or an invalid character is hit in string
 */
static char *
utf8_strstr(const char *string, const char *needle, int icase)
{
  char *origlocale;
  int clen;
  wchar_t wc;
  size_t slen, blen, nlen;
  struct ndl_wc ndlwc;
  const char *sbeg;  /* start of the match currently being tried */

  if (string == NULL || needle == NULL) { return NULL; }
  slen = strlen(string);
  nlen = strlen(needle);
  if (nlen == 0) { return (char *)string; }
  if (slen == 0) { return NULL; }

  /* set UTF-8 locale */
  origlocale = utf8_setlocale();
  if (origlocale == NULL) { fprintf(stderr, "utf8_strstr(): %s\n", SML3_fehlermsg()); return NULL; }

  /* convert the characters of needle to wchar_t, lowercase them if requested */
  if (!ndl_set(&ndlwc, needle, nlen, icase, 0)) { utf8_restorelocale(origlocale); return NULL; }

  /* search for the characters of needle in the string */
  ndlwc.ndlpos = 0;
  sbeg = string;
  blen = slen;
  for (;;) {
    clen = SML3_utf8_next(string, slen, NULL);
    if (clen <= 0) { string = NULL; break; }
    if (!utf8_to_wc(&wc, string, clen)) { string = NULL; break; }
    if (icase) { wc = towlower(wc); }
    if (ndl_check(&ndlwc, wc, icase)) {
      /* first needle character matched: remember where the match starts */
      if (ndlwc.ndlpos == 0) { sbeg = string; blen = slen; }
      /* all needle characters matched */
      if (++ndlwc.ndlpos == ndlwc.ndlmax) { break; }
    } else {
      if (ndlwc.ndlpos > 0) {
        /* partial match failed: go back to its start; clen is set to the
         * length of the character there so that the advance below
         * continues with the character after it */
        ndlwc.ndlpos = 0;
        string = sbeg;
        slen = blen;
        clen = SML3_utf8_next(string, slen, NULL);
      }
    }
    string += clen;
    slen -= clen;
  }

  ndl_free(&ndlwc);
  utf8_restorelocale(origlocale);

  /* string == NULL marks "not found"; otherwise the match starts at sbeg */
  if (string == NULL) { return NULL; }
  return (char *)sbeg;
} /* end of utf8_strstr */


/* SML3_utf8_strcmp:
 * like strcmp for UTF-8 (not really needed, strcmp() is sufficient)
 * 1.arg: string
 * 2.arg: string to compare with
 * Return: -1 = 1.arg is less, 0 = equal, 1 = 1.arg is greater
 */
int
SML3_utf8_strcmp(const char *string1, const char *string2)
{
  return utf8_strncmp(string1, string2, 0, 0);
} /* end of SML3_utf8_strcmp */


/* SML3_utf8_strcasecmp:
 * like strcasecmp for UTF-8
 * 1.arg: string
 * 2.arg: string to compare with
 * Return: -1 = 1.arg is less, 0 = equal, 1 = 1.arg is greater
 */
int
SML3_utf8_strcasecmp(const char *string1, const char *string2)
{
  return utf8_strncmp(string1, string2, 0, 1);
} /* end of SML3_utf8_strcasecmp */


/* SML3_utf8_strncmp:
 * like strncmp for UTF-8 (not really needed, strncmp() is sufficient)
 * 1.arg: string
 * 2.arg: string to compare with
 * 3.arg: maximum number of UTF-8 characters; 0 is passed on unchanged and
 *        is treated as "no limit" by utf8_strncmp()
 * Return: -1 = 1.arg is less, 0 = equal, 1 = 1.arg is greater
 */
int
SML3_utf8_strncmp(const char *string1, const char *string2, size_t umax)
{
  return utf8_strncmp(string1, string2, umax, 0);
} /* end of SML3_utf8_strncmp */


/* SML3_utf8_strncasecmp:
 * like strncasecmp for UTF-8
 * 1.arg: string
 * 2.arg: string to compare with
 * 3.arg: maximum number of UTF-8 characters; 0 is passed on unchanged and
 *        is treated as "no limit" by utf8_strncmp()
 * Return: -1 = 1.arg is less, 0 = equal, 1 = 1.arg is greater
 */
int
SML3_utf8_strncasecmp(const char *string1, const char *string2, size_t umax)
{
  return utf8_strncmp(string1, string2, umax, 1);
} /* end of SML3_utf8_strncasecmp */


/* Compares two UTF-8 strings character by character.
 * string1: first string, must not be NULL (abort() otherwise)
 * string2: second string, must not be NULL (abort() otherwise)
 * umax:    maximum number of UTF-8 characters to compare, 0 = no limit
 * icase:   if set, characters are compared lowercased
 * Return: -1 = string1 is less, 0 = equal, 1 = string1 is greater.
 *         If the locale cannot be set or one of the strings is not valid
 *         UTF-8, the result of the byte-wise strcmp()/strncmp()/
 *         strcasecmp()/strncasecmp() on the complete strings is returned;
 *         umax is then a number of bytes.
 */
static int
utf8_strncmp(const char *string1, const char *string2, size_t umax, int icase)
{
  char *origlocale;
  int clen;
  wchar_t wc;
  size_t slen1, slen2, upos;
  struct ndl_wc ndlwc;
  const char *sptr;  /* walks through string1; string1 itself stays intact for the fallback */
  int retw;

  if (string1 == NULL || string2 == NULL) { abort(); }
  slen1 = strlen(string1);
  slen2 = strlen(string2);
  if (slen1 == 0 && slen2 == 0) { return 0; }
  if (slen1 == 0) { return -1; }
  if (slen2 == 0) { return 1; }

  /* so that ndl_free() is safe on every path to cmperr */
  memset(&ndlwc, 0, sizeof(ndlwc));

  /* set UTF-8 locale */
  origlocale = utf8_setlocale();
  if (origlocale == NULL) { fprintf(stderr, "utf8_strcmp(): %s\n", SML3_fehlermsg()); goto cmperr; }

  /* convert string2 to wchar_t, lowercase it if requested */
  if (!ndl_set(&ndlwc, string2, slen2, icase, 0)) { goto cmperr; }

  /* compare string2 with string1 */
  retw = 0;
  ndlwc.ndlpos = 0;
  sptr = string1;
  for (upos = 0; umax == 0 || upos < umax; upos++) {
    clen = SML3_utf8_next(sptr, slen1, NULL);
    if (clen < 0) { goto cmperr; }
    if (clen == 0) {
      /* string1 has ended: equal only if string2 has ended as well */
      if (ndlwc.ndlpos == ndlwc.ndlmax) { retw = 0; break; }
      retw = -1;
      break;
    }
    if (!utf8_to_wc(&wc, sptr, clen)) { goto cmperr; }
    if (icase) { wc = towlower(wc); }
    /* string2 has ended, string1 has not */
    if (ndlwc.ndlpos == ndlwc.ndlmax) { retw = 1; break; }
    if (!ndl_check(&ndlwc, wc, icase)) {
      if (wc < ndlwc.e[ndlwc.ndlpos].wc) {
        retw = -1;
      } else {
        retw = 1;
      }
      break;
    }
    sptr += clen;
    slen1 -= (size_t)clen;
    ndlwc.ndlpos++;
  }

  /* the character limit was reached without a difference */
  if (umax > 0 && upos == umax) { retw = 0; }

  ndl_free(&ndlwc);
  utf8_restorelocale(origlocale);
  return retw;

cmperr:
  ndl_free(&ndlwc);
  utf8_restorelocale(origlocale);

  /* byte-wise fallback; both strings are compared from their beginning */
  if (icase) {
    if (umax > 0) { return strncasecmp(string1, string2, umax); }
    return strcasecmp(string1, string2);
  }
  if (umax > 0) { return strncmp(string1, string2, umax); }
  return strcmp(string1, string2);
} /* end of utf8_strncmp */
