14. More involved processing and IO

14.1. Text processing.

다음의 코드는 숫자들이 있는 라인을 입력받아 콘솔에 출력한다.

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

size_t* numberline(size_t size, char const lbuf[restrict size],
                   size_t*restrict np, int base);

char* fgetline(size_t size, char s[restrict size],
               FILE*restrict stream);

#include <limits.h>
#include <errno.h>
#ifndef EFAULT
# define EFAULT EDOM
#endif
#ifndef EOVERFLOW
# define EOVERFLOW (EFAULT-EOF)
# if EOVERFLOW > INT_MAX
#  error EOVERFLOW constant is too large
# endif
#endif
#ifndef ENOMEM
# define ENOMEM (EOVERFLOW+EFAULT-EOF)
# if ENOMEM > INT_MAX
#  error ENOMEM constant is too large
# endif
#endif

int fprintnumbers(FILE*restrict stream,
                  char const form[restrict static 1],
                  char const sep[restrict static 1],
                  size_t len, size_t numb[restrict len]);

int sprintnumbers(size_t tot, char buf[restrict tot],
                  char const form[restrict static 1],
                  char const sep[restrict static 1],
                  size_t len, size_t nums[restrict len]);

/*
 * Parse consecutive unsigned numbers from the string act with strtoull,
 * interpreting them in the given base, and store them into numb[0],
 * numb[1], ... in order of appearance.
 *
 * Scanning stops at the end of the string or at the first position where
 * strtoull consumes no characters. Note that the slot numb[count] is
 * written before the result of the conversion is checked, so the array
 * needs room for one element more than the number of values found.
 *
 * Returns the number of values that were converted.
 */
static
size_t numberline_inner(char const*restrict act,
                        size_t numb[restrict], int base){
  size_t count = 0;
  char* end = 0;
  while (act[0]) {
    numb[count] = strtoull(act, &end, base);
    /* No progress means there is no further number to parse. */
    if (end == act) break;
    ++count;
    act = end;
  }
  return count;
}

/*
 * Split the text in lbuf into an array of numbers.
 *
 * lbuf must contain a terminating null character within its first size
 * bytes; otherwise nothing is parsed. The numbers are converted by
 * numberline_inner using the given base.
 *
 * On return, *np (if np is not null) holds the number of values found.
 *
 * Returns a malloc'ed array holding the values (at least one element is
 * allocated even if no number was found), which the caller must free.
 * Returns 0, with *np set to 0, if lbuf is not a string or if the
 * allocation fails.
 */
size_t* numberline(size_t size, char const lbuf[restrict size],
                   size_t*restrict np, int base){
  size_t* ret = 0;
  size_t n = 0;
  /* Only parse if lbuf really is a string within its size bytes. */
  if (memchr(lbuf, 0, size)) {
    /* Scratch array sized from the text length (original estimate). */
    ret = malloc(sizeof(size_t[1+(2*size)/3]));
    /* Without this check numberline_inner would write through null. */
    if (ret) {
      n = numberline_inner(lbuf, ret, base);
      size_t len = n ? n : 1;
      /* Shrink to fit. If realloc fails the original block is still
         valid and large enough, so just keep it instead of losing it. */
      size_t* shrunk = realloc(ret, sizeof(size_t[len]));
      if (shrunk) ret = shrunk;
    }
  }
  if (np) *np = n;
  return ret;
}

/*
 * Read one line from stream into s, which has room for size bytes, and
 * strip the terminating newline.
 *
 * s[0] is set to 0 before reading, so after a failed call the caller can
 * inspect s[0] to tell apart "nothing was read" from "a partial line
 * without newline was read".
 *
 * Returns s if a complete line (ending in '\n') was read, and 0 if fgets
 * failed or if no newline was found in what was read.
 */
char* fgetline(size_t size, char s[restrict size],
               FILE*restrict stream){
  s[0] = 0;
  if (!fgets(s, size, stream)) return 0;
  char* eol = strchr(s, '\n');
  /* No newline: the line did not fit (or the input ended early). */
  if (!eol) return 0;
  *eol = 0;
  return s;
}

/*
 * Helper for error returns: reset errno to the value prev that was saved
 * on function entry and produce the negated error code err as the value
 * to return to the caller.
 */
static inline int error_cleanup(int err, int prev) {
  int const result = -err;
  errno = prev;
  return result;
}

/*
 * Format the len numbers in nums into buf: each number is printed with
 * format form, consecutive numbers are separated by sep, and the line is
 * finished with "\n" and a terminating null character.
 *
 * The parameter tot is not consulted; sprintf writes without any bound,
 * so the caller must guarantee that buf is large enough for the whole
 * line including newline and terminator.
 *
 * Returns the number of characters stored, counting the newline but not
 * the terminating null character.
 */
int sprintnumbers(size_t tot, char buf[restrict tot],
                  char const form[restrict static 1],
                  char const sep[restrict static 1],
                  size_t len, size_t nums[restrict len]) {
  char* cursor = buf;   /* where the next piece of output goes */
  size_t const seplen = strlen(sep);
  for (size_t i = 0; i < len; ++i) {
    /* A separator goes between numbers, never in front of the first. */
    if (i) {
      memcpy(cursor, sep, seplen);
      cursor += seplen;
    }
    cursor += sprintf(cursor, form, nums[i]);
  }
  /* Copies the newline together with the terminating null character. */
  memcpy(cursor, "\n", 2);
  return (cursor-buf)+1;
}

/*
 * Print the len numbers in nums to stream as one line: each number is
 * formatted with form, numbers are separated by sep and the line ends
 * with a newline. The whole line is assembled in a temporary buffer and
 * written with a single fputs call.
 *
 * Returns a negative value on error: -EFAULT if stream is null or nums
 * is null while len is not 0, -EOVERFLOW if len or the size of the
 * output exceeds INT_MAX or a number cannot be formatted, -ENOMEM if
 * the buffer cannot be allocated, and EOF if writing to stream fails.
 * In the -EOVERFLOW (after formatting started) and -ENOMEM cases errno
 * is restored to its value on entry.
 *
 * On success the return value is the size estimate computed here: the
 * sum of the formatted widths of all numbers plus max(len, 1) times the
 * length of sep.
 */
int fprintnumbers(FILE*restrict stream,
                  char const form[restrict static 1],
                  char const sep[restrict static 1],
                  size_t len, size_t nums[restrict len]) {
  if (!stream)       return -EFAULT;
  if (len && !nums)  return -EFAULT;
  if (len > INT_MAX) return -EOVERFLOW;

  int err = errno;
  size_t tot = (len ? len : 1)*strlen(sep);
  /* The result is returned as int, so the size must stay in range. */
  if (tot > INT_MAX) return error_cleanup(EOVERFLOW, err);

  /* Measure the width of each number without writing anything. */
  for (size_t i = 0; i < len; ++i) {
    int width = snprintf(0, 0, form, nums[i]);
    /* A negative result signals a formatting failure; it must not be
       added to the unsigned total. */
    if (width < 0) return error_cleanup(EOVERFLOW, err);
    tot += (size_t)width;
    if (tot > INT_MAX) return error_cleanup(EOVERFLOW, err);
  }

  /* sprintnumbers stores the numbers, len-1 separators, a newline and a
     terminating null character. With an empty sep that is tot+2 bytes,
     so tot+1 would be one byte short. */
  size_t const cap = tot+2;
  char* buf = malloc(cap);
  if (!buf) return error_cleanup(ENOMEM, err);

  sprintnumbers(cap, buf, form, sep, len, nums);
  /* Print the whole line in one go. */
  int ret = (fputs(buf, stream) == EOF) ? EOF : (int)tot;
  free(buf);
  return ret;
}

/*
 * Variant of fprintnumbers that formats directly into a growing buffer
 * instead of measuring the output first.
 *
 * The len numbers in nums are formatted with form and separated by sep;
 * unlike fprintnumbers, no newline is appended. The assembled text is
 * written to stream with a single fputs call.
 *
 * Returns the number of characters written on success. Returns a
 * negative value on error: -EFAULT if stream is null or nums is null
 * while len is not 0, -EOVERFLOW if len or the output size exceeds
 * INT_MAX or a number cannot be formatted, -ENOMEM if memory runs out,
 * and EOF if writing to stream fails. For -EOVERFLOW (once formatting
 * has started) and -ENOMEM, errno is restored to its value on entry.
 */
int fprintnumbers_opt(FILE*restrict stream,
                  char const form[restrict static 1],
                  char const sep[restrict static 1],
                  size_t len, size_t nums[restrict len]) {
  if (!stream)       return -EFAULT;
  if (len && !nums)  return -EFAULT;
  if (len > INT_MAX) return -EOVERFLOW;

  int err = errno;
  int ret = 0;
  size_t const seplen = strlen(sep);

  size_t tot = 0;    /* characters stored in buf so far */
  /* Initial guess of 10 characters per number. The extra byte makes
     sure that there is room for the terminator even if len is 0. */
  size_t mtot = 1;
  if (len <= (INT_MAX-1)/(seplen+10)) mtot += len*(seplen+10);
  char* buf = malloc(mtot);

  if (!buf) return error_cleanup(ENOMEM, err);

  for (size_t i = 0; i < len; ++i) {
    /* A separator follows every number but the last. */
    size_t const tail = (i+1 < len) ? seplen : 0;
    /* Format number i; grow the buffer and retry until the number, its
       separator and the terminator all fit. Invariant: tot < mtot. */
    for (;;) {
      int width = snprintf(&buf[tot], mtot-tot, form, nums[i]);
      if (width < 0) {
        ret = error_cleanup(EOVERFLOW, err);
        goto CLEANUP;
      }
      size_t need = tot + (size_t)width + 1;
      /* The count is returned as int, so it has to stay in range. */
      if (need > INT_MAX || tail > INT_MAX - need) {
        ret = error_cleanup(EOVERFLOW, err);
        goto CLEANUP;
      }
      need += tail;
      if (need <= mtot) {
        tot += (size_t)width;
        break;
      }
      size_t const grown = (mtot > need/2) ? 2*mtot : need;
      char* nbuf = realloc(buf, grown);
      /* On failure buf is still valid and is released at CLEANUP. */
      if (!nbuf) {
        ret = error_cleanup(ENOMEM, err);
        goto CLEANUP;
      }
      buf = nbuf;
      mtot = grown;
    }
    memcpy(&buf[tot], sep, tail);
    tot += tail;
  }
  buf[tot] = 0;

  /* Print the whole text in one go. */
  ret = (fputs(buf, stream) == EOF) ? EOF : (int)tot;
 CLEANUP:
  free(buf);
  return ret;
}

/*
 * Read lines of numbers from stdin and echo each line to stdout with the
 * numbers printed in upper-case hexadecimal, separated by ",\t".
 *
 * A line that does not fit into the 256 byte buffer is reported on
 * stderr and skipped. The program exits with EXIT_FAILURE if printing
 * fails or if the input ends in the middle of such an overlong line.
 */
int main(void) {
  char lbuf[256];
  for (;;) {
    if (!fgetline(sizeof lbuf, lbuf, stdin)) {
      /* The buffer is still empty: there is no more input. */
      if (!lbuf[0]) break;
      /* Otherwise only a part of a line was read; discard the rest of
         that line up to and including its newline. */
      int c;
      do {
        c = getc(stdin);
        if (c == EOF) return EXIT_FAILURE;
      } while (c != '\n');
      fprintf(stderr, "line too long: %s\n", lbuf);
      continue;
    }
    size_t n;
    size_t* nums = numberline(strlen(lbuf)+1, lbuf, &n, 0);
    int ret = fprintnumbers(stdout, "%#zX", ",\t", n, nums);
    if (ret < 0) return EXIT_FAILURE;
    free(nums);
  }
  return 0;
}

fgetline은 텍스트 라인을 받고, numberline은 해당 라인을 size_t 타입의 숫자들로 쪼개고, fprintnumbers는 그를 프린트한다. numberline은 numberline_inner 함수의 래퍼인데, 이 함수는 C 표준 라이브러리의 strtoull 함수를 사용한다.

Takeaway 2.14.1.1. strtoull 등의 문자열을 변환시키는 함수는 const로 보호하지 않는다.

Takeaway 2.14.1.2. memchr과 strchr 탐색 함수는 인자를 const로 보호하지 않는다.

Takeaway 2.14.1.3. strspn과 strcspn 탐색 함수는 인자를 const로 보호한다.

그러나 strspn과 strcspn은 char 배열이 실제로 문자열인지(널 문자로 끝나는지) 확인하는 데에는 쓸 수 없으므로, numberline에서는 memchr로 이를 확인한다. fprintnumbers에서는 sprintnumbers를 호출하는데, 여기서 사용하는 C 표준 라이브러리의 sprintf 함수에는 큰 문제가 있다.

Takeaway 2.14.1.4. sprintf는 버퍼 오버플로우에 대한 대처가 없다.

sprintf를 호출할 때 버퍼 크기가 충분한지는 프로그래머가 보장해야 한다. 대신에 snprintf를 쓸 수 있다.

Takeaway 2.14.1.5. 길이를 모르는 출력을 서식화할 때는 snprintf를 이용하라.

14.2. Formatted input.

C 표준 라이브러리에서는 서식화된 입력에 대해 임의의 스트림에서는 fscanf, 표준 입력 스트림에서는 scanf, 문자열 입력은 sscanf 함수를 제공한다. 이 서식은 printf류의 출력 함수들보다 조금 더 어려운데 그 이유들은 다음과 같다.

  • 모든 포맷에 대한 입력을 지원하기 위해, 인자들이 포인터 타입이다.
  • 공백 문자 입력 처리가 비직관적이다.
  • 문자열 처리가 다르다. %c는 문자 하나를, %s는 공백 없는 문자열을 읽어 0을 붙인다.
  • printf와 부동 소수점 서식 지정자가 다르다. %lg(double)와 %Lg(long double)는 printf와 scanf 양쪽에서 똑같이 쓸 수 있으므로 이를 사용하자.
  • 문자 클래스를 인식하는 것이 가능하다. %[aeiouAEIOU]는 모음을 스캔한다.

입력 패턴을 일관적으로 사용하도록 하자.

14.3. Extended character sets.

기본 아스키 문자 세트뿐만 아니라 멀티바이트 문자 세트를 이용해 확장된 문자 세트도 지원하도록 할 수 있다. 이를 위해서는 먼저 setlocale로 로케일을 적절히 초기화시켜주는 것이 중요하다.

Takeaway 2.14.3.1. 멀티바이트 문자는 널 바이트를 포함하지 않는다.

Takeaway 2.14.3.2. 멀티바이트 문자열은 널 문자로 끝난다.

아래의 코드는 멀티바이트 문자열을 지원하는 각종 헬퍼 함수들이다.

// mbstrings.h
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include <ctype.h>

/**
 ** @file
 ** @brief Helper functions for multibyte strings.
 **
 ** These functions try to ease the use of multibyte strings. Most of
 ** the time they pass through wide character functions to achieve
 ** their goal. This is of limited use on platforms where wide
 ** characters are not of fixed width but use surrogates. For such
 ** surrogate characters the classification functions will fail
 ** erratically.
 **
 ** @warning this implementation supposes that @c wchar_t supports
 ** some form of Unicode.
 **/

#ifndef __STDC_ISO_10646__
# error "wchar_t wide characters have to be Unicode code points"
#endif
#ifdef __STDC_MB_MIGHT_NEQ_WC__
# error "basic character codes must agree on char and wchar_t"
#endif

/**
 ** @brief A type for 16 bit characters as provided by "uchar.h"
 **
 ** This is the base type of strings that are prefixed with @c u, such
 ** as in <code>u"string"</code>.
 **
 ** There are good chances that on most platforms this is the UCS-2
 ** encoding, thus an encoding that has surrogates.
 **
 ** @remark This should be provided by the header file, but the type
 ** is fixed as is given here, and C11 compilers should tolerate a
 ** repetition of a @c typedef, as long as it is consistent.
 **/
typedef uint_least16_t char16_t;

/**
 ** @brief A type for 32 bit characters as provided by "uchar.h"
 **
 ** This is the base type of strings that are prefixed with @c U, such
 ** as in <code>U"string"</code>.
 **
 ** There are good chances that on most platforms this is the UCS-4
 ** encoding, thus an encoding without surrogates.
 **
 ** @remark This should be provided by the header file, but the type
 ** is fixed as is given here, and C11 compilers should tolerate a
 ** repetition of a @c typedef, as long as it is consistent.
 **/
typedef uint_least32_t char32_t;

/**
 ** @brief A helper type to ensure proper initialization of @c
 ** mbstate_t variables
 **
 ** The @c bytes member overlays the @c state member, such that
 ** initializing @c bytes with zeros sets all bytes of @c state to
 ** zero.
 **
 ** Only the type is declared here. A header must not define an
 ** object, because that object would then be defined in every
 ** translation unit that includes the header.
 **
 ** Not to be used directly
 ** @see MBSTATE
 **/
union mbstate_t_ {
  unsigned char bytes[sizeof(mbstate_t)];
  mbstate_t state;
};

/**
 ** @brief Provide a pointer to a properly initialized @c mbstate_t
 **
 ** This uses the union type to ensure that the state is initialized
 ** with all bytes set to @c 0.
 **/
#define MBSTATE (&(union mbstate_t_){ .bytes = { 0 }}.state)

/**
 ** @brief For functions that receive a wide character classification
 ** function as an argument.
 **/
typedef int (*wcclass_t)(wint_t);

/**
 ** @brief Interpret a sequence of bytes in @a c as mb character
 ** and return that as wide character through @a C
 **
 ** @return the length of the mb character or @c -1 if an
 ** encoding error occurred.
 **
 ** This function can be integrated into a sequence of such
 ** searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions.
 **
 ** @remark @a state of @c 0 indicates that @a c can be scanned
 ** without considering any context.
 **/
size_t mbrtow(wchar_t*restrict C, char const c[restrict static 1],
              mbstate_t*restrict state);

/**
 ** @brief Interpret a sequence of bytes in @a c as mb character and
 ** return that as wide character.
 **
 ** @return the wide character converted to @c wint_t or @c WEOF if an
 ** encoding error occurred.
 **
 ** This function should only be used for strings containing one
 ** single mb character. It cannot be integrated into a sequence of
 ** searches through a string with more characters since the @c
 ** mbstate_t is not kept.
 **/
wint_t mbtow(char const*c);

/**
 ** @brief Interpret a mb string in @a mbs and return its
 ** length when interpreted as a wide character string
 **
 ** @return the length of the mb string or @c -1 if an
 ** encoding error occurred.
 **
 ** This function can be integrated into a sequence of searches
 ** through a string, as long as a @a state argument is passed to
 ** this function that is consistent with the mb character
 ** starting in @a mbs. The state itself is not modified by this
 ** function.
 **
 ** @remark @a state of @c 0 indicates that @a mbs can be scanned
 ** without considering any context.
 **/
size_t mbsrlen(char const*restrict mbs,
               mbstate_t const*restrict state);

/**
 ** @brief Interpret a sequence of bytes in @a s as mb string and
 ** convert it to a wide character string.
 **
 ** @return a newly malloc'ed wide character string of the
 ** appropriate length, @c 0 if an encoding error occurred.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as a @a state
 ** argument is passed to this function that is consistent with
 ** the mb character starting in @a c. The state itself is not
 ** modified by this function.
 **
 ** @remark @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
wchar_t* mbsrdup(char const*s, mbstate_t const*restrict state);

/**
 ** @brief Copy at most @a n bytes of the mb string @a s to @a t.
 **
 ** This is meant to be an efficient variant of ::mbsrncpy for
 ** the case the copy operation can be done with ::memcpy.
 **
 ** Mb string @a s is supposed to be a complete mb string in
 ** initial shift state. If it has no @c 0 character in the first
 ** @a n bytes, the copy operation fails and the function returns
 ** @c 0.
 **
 ** The mb string that is produced in @a t may be the
 ** continuation of an existing prefix whose conversion state is
 ** described by @a state.
 **
 ** If @a s can not completely be copied because the adjustment
 ** of the shift state would leave less than
 ** <code>strlen(s)+1</code> space in @a t, the copy operation is
 ** aborted and @c 0 is returned.
 **
 ** @return the address of the terminating null character in @a t
 ** if the copy operation succeeded, @c 0 otherwise. In case of
 ** success, @a t is in an initial shift state for the returned
 ** position.
 **/
char* mbsrmbsncpy(size_t n, char t[restrict n], mbstate_t const*restrict state,
                  char const s [restrict static 1]);


/**
 ** @brief Copy at most @a n bytes of the mb string @a s to @a t.
 **
 ** This is a safe variant of ::mbsrmbsncpy that covers all
 ** cases.
 **
 ** Mb string @a s is supposed to be a mb string in state @a
 ** sstate. If it has no @c 0 character in the first @a n bytes,
 ** the copy operation will be partial up to that point.
 **
 ** The mb string that is produced in @a t may be the continuation of
 ** an existing prefix whose conversion state is described by @a
 ** sstate.
 **
 ** This function copies complete mb characters as long as they
 ** fit into the target array. To be able to do so, the
 ** conversion state of @a t might have to be adapted to be in
 ** sync with @a s. By that the copied part may be longer than
 ** <code>strlen(s)+1</code>.
 **
 ** @return The address of the first byte in @a t after the copy
 ** is returned; that byte is null if the whole @a s could be
 ** copied. If nothing could be copied, @a t is returned. In that
 ** case @c *t is not written to and @a tstate is unchanged.
 **/
char* mbsrncpy(size_t n, char t[restrict n], mbstate_t*restrict tstate,
               char const s [restrict static 1], mbstate_t const*restrict sstate);


/**
 ** @brief Interpret a sequence of bytes in @a s as mb string and
 ** search for wide character @a C
 **
 ** @return the @a occurrence'th position in @a s that starts a
 ** mb sequence corresponding to @a C or @c 0 if an encoding
 ** error occurred.
 **
 ** If the number of occurrences is less than @a occurrence the
 ** last such position is returned. So in particular using @c
 ** SIZE_MAX (or @c -1) will always return the last occurrence.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrwc(char const s[restrict static 1],
                   mbstate_t*restrict state,
                   wchar_t C, size_t occurrence);

/**
 ** @brief Interpret a sequence of bytes in @a s as mb string and
 ** search for mb character represented by @a c
 **
 ** @return the @a occurrence'th position in @a s that starts a
 ** mb sequence corresponding to @a c or @c 0 if an encoding
 ** error occurred.
 **
 ** If the number of occurrences is less than @a occurrence the
 ** last such position is returned. So in particular using @c
 ** SIZE_MAX (or @c -1) will always return the last occurrence.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrmb(char const s[static 1], mbstate_t*restrict state,
                   char const c[static 1], size_t occurence);

/**
 ** @brief Interpret a sequence of bytes in @a s as mb string and
 ** reverse search for wide character @a C
 **
 ** @return the maximal position in @a s that starts a mb
 ** sequence corresponding to @a C or @c 0 if an encoding error
 ** occurred.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrrwc(char const s[restrict static 1], mbstate_t*restrict state,
                    wchar_t C);

/**
 ** @brief Interpret a sequence of bytes in @a s as mb string and
 ** reverse search for mb character represented by @a c
 **
 ** @return the maximal position in @a s that starts a mb
 ** sequence corresponding to @a c or @c 0 if an encoding error
 ** occurred.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrrmb(char const s[static 1], mbstate_t*restrict state,
                    char const c[static 1]);

/**
 ** @brief In mb string @a s1 jump over the initial segment
 ** corresponding to wide character string @a S2.
 **
 ** @return the position in @a s1 after the initial segment if @a
 ** S2 is found or @c 0 otherwise.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark A @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrwcjump(char const s1[static 1], mbstate_t*restrict state,
                       size_t S2len, wchar_t const S2[S2len]);

/**
 ** @brief In mb string @a s1 find a segment corresponding to
 ** wide character string @a S2.
 **
 ** @return the position in @a s1 that starts the first such
 ** segment if @a S2 is found or @c 0 otherwise.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark A @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrwcs(char const s1[static 1], mbstate_t*restrict state,
                    wchar_t const S2[static 1]);

/**
 ** @brief In mb string @a s1 find a segment corresponding to mb
 ** string @a s2.
 **
 ** @return the position in @a s1 that starts the first such
 ** segment if @a s2 is found or @c 0 otherwise.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark A @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrmbs(char const s1[static 1], mbstate_t*restrict state,
                    char const s2[static 1]);

/**
 ** @brief In mb string @a s1 skip over the initial part of the
 ** string that corresponds to wide characters in string @a S2.
 **
 ** @return the position in @a s1 that corresponds to the
 ** first non-matching wide character in @a S2, which may be the
 ** terminating @c 0 character or a position that produces an
 ** encoding error.
 **
 ** @a state is updated to correspond to the returned position.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark A @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrwcsskip(char const s1[static 1], mbstate_t*restrict state,
                        wchar_t const* S2);

/**
 ** @brief In mb string @a s1 skip over the initial part of the
 ** string that corresponds to mb characters in string @a s2.
 **
 ** @return the position in @a s1 that corresponds to the
 ** first non-matching mb character in @a s2, which may be the
 ** terminating @c 0 character or a position that produces an
 ** encoding error.
 **
 ** @a state is updated to correspond to the returned position.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark A @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrskip(char const s1[static 1], mbstate_t*restrict state,
                     char const* s2);

/**
 ** @brief In mb string @a s1 return the length of the initial
 ** part of the string that corresponds to mb characters in
 ** string @a s2.
 **
 ** @return the length in bytes of the initial part of @a s1 that
 ** consists only of mb characters found in @a s2. The scan stops at
 ** the first non-matching mb character, which may be the terminating
 ** @c 0 character or a position that produces an encoding error.
 **
 ** @remark This function supposes that @a s1 can be scanned
 ** without context, that is it starts in an initial shift state.
 **/
size_t mbsspn(char const* s1, char const* s2);

/**
 ** @brief In mb string @a s1 skip over the initial part of the
 ** string that verifies classification @a func.
 **
 ** @return the position in @a s1 of the first mb character that
 ** does not verify @a func, which may be the terminating @c 0
 ** character or a position that produces an encoding error.
 **
 ** @a state is updated to correspond to the returned position.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark A @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrskip_class(char const s1[static 1], mbstate_t*restrict state,
                           wcclass_t func);

/**
 ** @brief In mb string @a s1 return the length of the initial
 ** part of the string that verifies classification @a func.
 **
 ** @return the length in bytes of the initial part of @a s1 whose
 ** mb characters all verify @a func. The scan stops at the first
 ** mb character that does not, which may be the terminating @c 0
 ** character or a position that produces an encoding error.
 **
 ** @remark This function supposes that @a s1 can be scanned
 ** without context, that is it starts in an initial shift state.
 **/
size_t mbsspn_class(char const* s1, wcclass_t func);

/**
 ** @brief In mb string @a s1 skip over the initial part of the
 ** string that verifies type @a type.
 **
 ** @return the position in @a s1 of the first mb character that
 ** is not of type @a type, which may be the terminating @c 0
 ** character or a position that produces an encoding error.
 **
 ** @a state is updated to correspond to the returned position.
 **
 ** @remark This function can be integrated into a sequence of
 ** such searches through a string, as long as the same @a state
 ** argument passed to all calls to this or similar functions and
 ** as long as the continuation of the search starts at the
 ** position that is returned by this function.
 **
 ** @remark A @a state of @c 0 indicates that @a s can be scanned
 ** without considering any context.
 **/
char const* mbsrskip_type(char const s1[static 1], mbstate_t*restrict state, wctype_t type);

/**
 ** @brief In mb string @a s1 return the length of the initial
 ** part of the string that verifies type @a type.
 **
 ** @return the length in bytes of the initial part of @a s1 whose
 ** mb characters all are of type @a type. The scan stops at the
 ** first mb character that is not, which may be the terminating
 ** @c 0 character or a position that produces an encoding error.
 **
 ** @remark This function supposes that @a s1 can be scanned
 ** without context, that is it starts in an initial shift state.
 **/
size_t mbsspn_type(char const* s1, wctype_t type);

/**
 ** @brief In mb string @a s1 return the length of the initial
 ** part of the string whose type corresponds to category @a
 ** name.
 **
 ** Predefined standard names are @c "alnum", @c "alpha", @c
 ** "blank", @c "cntrl", @c "digit", @c "graph", @c "lower", @c
 ** "print", @c "punct", @c "space", @c "upper", and @c
 ** "xdigit". Others may be provided by your platform.
 **
 ** @return the length in bytes of the initial part of @a s1 whose
 ** mb characters all belong to category @a name. The scan stops at
 ** the first mb character that does not, which may be the terminating
 ** @c 0 character or a position that produces an encoding error.
 **
 ** @remark This function supposes that @a s1 can be scanned
 ** without context, that is it starts in an initial shift state.
 **/
size_t mbsspn_name(char const* s1, char const name[static 1]);

/**
 ** @brief In mb string @a s1 skip over the initial part of the
 ** string that doesn't correspond to wide characters in string
 ** @a S2.
 **
 ** @see mbsrwcsskip 
 **/
char const* mbsrwcscskip(char const s1[static 1], mbstate_t*restrict state, wchar_t const* S2);

/**
 ** @brief In mb string @a s1 skip over the initial part of the
 ** string that doesn't correspond to mb characters in string @a
 ** s2.
 **
 ** @see mbsrskip
 **/
char const* mbsrcskip(char const* s1, mbstate_t*restrict state, char const* s2);

/**
 ** @brief In mb string @a s1 skip over the initial part of the
 ** string that doesn't correspond to mb characters in string @a
 ** s2.
 **
 ** @see mbsrskip
 **/
size_t mbscspn(char const* s1, char const* s2);

/**
 ** @brief Convert mb string @a s1 to @c double.
 **
 ** @see strtod
 **/
double mbsrtod(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr);

/**
 ** @brief Convert mb string @a s1 to <code>long double</code>.
 **
 ** @see strtold
 **/
long double mbsrtold(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr);

/**
 ** @brief Convert mb string @a s1 to <code>long</code> according
 ** to base @a base.
 **
 ** @see strtol
 **/
long mbsrtol(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base);

/**
 ** @brief Convert mb string @a s1 to <code>long long</code> according
 ** to base @a base.
 **
 ** @see strtoll
 **/
long long mbsrtoll(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base);

/**
 ** @brief Convert mb string @a s1 to <code>unsigned long</code>
 ** according to base @a base.
 **
 ** @see strtoul
 **/
unsigned long mbsrtoul(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base);

/**
 ** @brief Convert mb string @a s1 to <code>unsigned long
 ** long</code> according to base @a base.
 **
 ** @see strtoull
 **/
unsigned long long mbsrtoull(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base);
// mbstrings.c
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <stdbool.h>
#include <stdio.h>

#include "mbstrings.h"

/**
 ** @file
 ** @brief implement multibyte character string helpers
 **/

/*
 * Convert the multibyte character starting at c to a wide character
 * that is stored through C, using mbrtowc.
 *
 * If state is null, the conversion starts from an initial conversion
 * state that is local to this call.
 *
 * As long as mbrtowc reports an incomplete character ((size_t)-2), the
 * call is repeated with twice the byte limit, starting from MB_LEN_MAX.
 *
 * Returns the result of mbrtowc: the number of bytes of the character,
 * 0 for the null character, or (size_t)-1 for an encoding error. In the
 * error case errno is reset to 0.
 */
size_t mbrtow(wchar_t*restrict C, char const c[restrict static 1],
              mbstate_t*restrict state) {
  /* The fallback state must be a variable of function scope. A compound
     literal created inside the if statement would only live until the
     end of that statement and leave state dangling. */
  mbstate_t initial;
  memset(&initial, 0, sizeof initial);
  if (!state) state = &initial;
  size_t len = -2;
  for (size_t maxlen = MB_LEN_MAX; len == (size_t)-2; maxlen *= 2)
    len = mbrtowc(C, c, maxlen, state);
  if (len == (size_t)-1) errno = 0;
  return len;
}

/*
 * Convert the single multibyte character at c to a wide character,
 * starting from a fresh conversion state.
 *
 * Returns the wide character, or WEOF if mbrtow reports an encoding
 * error.
 */
wint_t mbtow(char const*c) {
  wchar_t wide = 0;
  size_t const consumed = mbrtow(&wide, c, MBSTATE);
  return (consumed == (size_t)-1) ? WEOF : wide;
}

/*
 * Count the wide characters that the multibyte string s converts to,
 * not counting the terminating null character.
 *
 * The conversion works on a private copy of *state, so the caller's
 * state is not modified. If state is null, an initial conversion state
 * is used.
 *
 * Returns the count, or (size_t)-1 if s contains an encoding error; in
 * that case errno is reset to 0.
 */
size_t mbsrlen(char const*s, mbstate_t const*restrict state) {
  /* The working state is a real local variable. Pointing state at a
     compound literal created inside an if statement would leave it
     dangling once that statement has ended. */
  mbstate_t st;
  if (state) st = *state;
  else memset(&st, 0, sizeof st);
  size_t count = mbsrtowcs(0, &s, 0, &st);
  if (count == (size_t)-1) errno = 0;
  return count;
}

/*
 * Convert the multibyte string s to a newly allocated wide character
 * string.
 *
 * The conversion starts from a copy of *state, or from an initial
 * conversion state if state is null; the caller's state is not changed.
 *
 * Returns the malloc'ed wide string, which the caller must free, or 0
 * if s has an encoding error or the allocation fails.
 */
wchar_t* mbsrdup(char const*s, mbstate_t const*restrict state) {
  size_t const wlen = mbsrlen(s, state);
  if (wlen == (size_t)-1) return 0;
  wchar_t* copy = malloc((wlen+1) * sizeof *copy);
  if (!copy) return 0;
  mbstate_t st = state ? *state : *MBSTATE;
  /* mbsrlen already converted s successfully, so no error check. */
  mbsrtowcs(copy, &s, wlen+1, &st);
  return copy;
}

/* Boundaries of the surrogate ranges: [SURROG0, SURROG1) holds the high
   surrogates and [SURROG1, SURROG2) the low surrogates. Each range has
   0x400 (1024) code points. */
#define SURROG0 0xD800L
#define SURROG1 (SURROG0 + 0x400)
#define SURROG2 (SURROG1 + 0x400)

/* Return 1 if x lies in the high surrogate range, 0 otherwise. */
int iswhighsurrogate(wint_t x) {
  return (x >= SURROG0) && (x < SURROG1);
}

/* Return 1 if x lies in the low surrogate range, 0 otherwise. */
int iswlowsurrogate(wint_t x) {
  return (x >= SURROG1) && (x < SURROG2);
}

/* Return 1 if x lies in either surrogate range, 0 otherwise. */
int iswsurrogate(wint_t x) {
  return (x >= SURROG0) && (x < SURROG2);
}

/* Return 1 if x is neither 0, nor WEOF, nor a surrogate; 0 otherwise. */
int iswvalid(wint_t x) {
  if (!x || x == WEOF) return 0;
  return !iswsurrogate(x);
}

/*
 * Search the multibyte string s for the wide character C.
 *
 * occurrence selects which match is wanted, counting from 0. If s has
 * fewer matches than that, the last match is returned.
 *
 * If state is null, the scan starts from an initial conversion state
 * local to this call. Otherwise *state is the conversion state at s,
 * and for every match it is updated to the state at that match.
 *
 * Returns the position of the selected match, or 0 if C is 0 or WEOF
 * or if no match is found before the end of the string or an encoding
 * error.
 */
char const* mbsrwc(char const s[restrict static 1], mbstate_t*restrict state,
                   wchar_t C, size_t occurrence) {
  if (!C || C == WEOF) return 0;
  /* The fallback state must be a variable of function scope. A compound
     literal created inside the if statement would only live until the
     end of that statement and leave state dangling. */
  mbstate_t initial;
  memset(&initial, 0, sizeof initial);
  if (!state) state = &initial;
  char const* ret = 0;

  mbstate_t st = *state;
  for (size_t len = 0; s[0]; s += len) {
    /* Remember the state in front of the character that is decoded. */
    mbstate_t backup = st;
    wchar_t S = 0;
    len = mbrtow(&S, s, &st);
    /* S stays 0 at the end of the string and on encoding errors. */
    if (!S) break;
    if (C == S) {
      *state = backup;
      ret = s;
      if (!occurrence) break;
      --occurrence;
    }
  }
  return ret;
}

/*
 * Search the multibyte string s for the multibyte character c.
 *
 * c is converted to a wide character with mbtow and the search is then
 * done by mbsrwc with the same occurrence argument.
 *
 * If state is null, an initial conversion state local to this call is
 * used.
 *
 * Returns what mbsrwc returns for the converted character.
 */
char const* mbsrmb(char const s[static 1], mbstate_t*restrict state,
                   char const c[static 1], size_t occurrence) {
  /* The fallback state must be a variable of function scope. A compound
     literal created inside the if statement would only live until the
     end of that statement and leave state dangling. */
  mbstate_t initial;
  memset(&initial, 0, sizeof initial);
  if (!state) state = &initial;
  wint_t C = mbtow(c);
  return mbsrwc(s, state, C, occurrence);
}

/*
 * Reverse search: find the last position in the multibyte string s that
 * holds the wide character C.
 *
 * This forwards to mbsrwc with the largest possible occurrence count.
 */
char const* mbsrrwc(char const s[restrict static 1], mbstate_t*restrict state,
                    wchar_t C) {
  size_t const last = SIZE_MAX;
  return mbsrwc(s, state, C, last);
}

/*
 * Reverse search: find the last position in the multibyte string s that
 * holds the multibyte character c.
 *
 * This forwards to mbsrmb with the largest possible occurrence count.
 */
char const* mbsrrmb(char const s[static 1], mbstate_t*restrict state,
                    char const c[static 1]) {
  size_t const last = SIZE_MAX;
  return mbsrmb(s, state, c, last);
}

/*
 * Check that the multibyte string s1 starts with the S2len wide
 * characters in S2 and jump over them.
 *
 * If state is null, an initial conversion state local to this call is
 * used. *state is only updated if the whole of S2 was matched.
 *
 * Returns the position in s1 directly after the matched prefix, or 0 if
 * s1 does not start with S2 or an encoding error is met.
 */
char const* mbsrwcjump(char const s1[static 1], mbstate_t*restrict state,
                       size_t S2len, wchar_t const S2[S2len]) {
  /* The fallback state must be a variable of function scope. A compound
     literal created inside the if statement would only live until the
     end of that statement and leave state dangling. */
  mbstate_t initial;
  memset(&initial, 0, sizeof initial);
  if (!state) state = &initial;
  mbstate_t st = *state;
  for (size_t i = 0; i < S2len; ++i) {
    wchar_t S1 = 0;
    size_t len = mbrtow(&S1, s1, &st);
    /* Never advance by the error value (size_t)-1. */
    if (len == (size_t)-1 || S1 != S2[i]) return 0;
    s1 += len;
  }
  *state = st;
  return s1;
}


/*
 * Find the first segment of the multibyte string s1 that corresponds
 * to the wide character string S2.
 *
 * If state is null, an initial conversion state local to this call is
 * used. Otherwise *state is set to the conversion state at the start
 * of the segment once it has been found.
 *
 * Returns the position in s1 where the segment starts, or 0 if S2 is
 * empty or does not occur in s1.
 */
char const* mbsrwcs(char const s1[static 1], mbstate_t*restrict state,
                    wchar_t const S2[static 1]) {
  /* The fallback state must be a variable of function scope. A compound
     literal created inside the if statement would only live until the
     end of that statement and leave state dangling. */
  mbstate_t initial;
  memset(&initial, 0, sizeof initial);
  if (!state) state = &initial;
  size_t S2len = wcslen(S2);
  if (!S2len) return 0;
  /* A single character is an ordinary character search. */
  if (S2len == 1) return mbsrwc(s1, state, S2[0], 0);

  /* Don't modify shift state until we found it. */
  mbstate_t rstate = *state;
  while (s1 && s1[0]) {
    s1 = mbsrwc(s1, &rstate, S2[0], 0);
    if (s1 && s1[0]) {
      /* s1 now is at a potential starting point */
      char const* ret = s1;
      mbstate_t tstate = rstate;
      if (mbsrwcjump(s1, &tstate, S2len, S2)) {
        *state = rstate;
        return ret;
      }
      /* Advance s1 to the next mb character. */
      s1 += mbrtow(0, s1, &rstate);
    }
  }
  return 0;
}

/*
 * Find the first segment of the multibyte string s1 that corresponds
 * to the multibyte string s2.
 *
 * s2 is converted to a temporary wide character string with mbsrdup and
 * the search is then done by mbsrwcs.
 *
 * If state is null, an initial conversion state local to this call is
 * used.
 *
 * Returns the position in s1 where the segment starts, or 0 if s2 is
 * empty, cannot be converted, or does not occur in s1.
 */
char const* mbsrmbs(char const s1[static 1], mbstate_t*restrict state,
                    char const s2[static 1]) {
  /* The fallback state must be a variable of function scope. A compound
     literal created inside the if statement would only live until the
     end of that statement and leave state dangling. */
  mbstate_t initial;
  memset(&initial, 0, sizeof initial);
  if (!state) state = &initial;
  wchar_t* S2 = mbsrdup(s2, 0);
  char const* ret = 0;
  if (S2 && S2[0]) ret = mbsrwcs(s1, state, S2);
  /* Release the copy on every path, also when s2 was empty. */
  free(S2);
  return ret;
}

/* Skip over the leading multibyte characters of s1 whose wide form is
   found in the wide string S2.

   If S2 is null, nothing is done: s1 is returned as is and state is
   not looked at. Otherwise a null state is replaced by MBSTATE.

   Returns a pointer to the first character of s1 that was not skipped
   (possibly its terminating null character). */
char const* mbsrwcsskip(char const s1[static 1], mbstate_t*restrict state,
                        wchar_t const* S2){
  if (S2) {
    if (!state) state = MBSTATE;
    /* Decode on a copy; *state is updated after each skipped character. */
    mbstate_t st = *state;
    for (size_t len; s1[0]; *state = st, s1 += len) {
      /* Zero-initialized, so that after one or two stored characters
         S1 is a null-terminated wide string. */
      wchar_t S1[3] = { 0 };
      len = mbrtow(&S1[0], s1, &st);
      if (!S1[0]) break;
      if (!iswlowsurrogate(S1[0])) {
        /* Single wide character: stop if it is not in S2. */
        if (!wcschr(S2, S1[0])) break;
      } else {
        /* NOTE(review): the second mbrtow call reads from s1 again, not
           from s1+len; presumably it relies on the state to deliver the
           second half of a surrogate pair -- TODO confirm. */
        len += mbrtow(&S1[1], s1, &st);
        /* The pair is looked up as a two-character substring of S2. */
        if (!wcsstr(S2, S1)) break;
      }
    }
    /* NOTE(review): after a break, st has already consumed the character
       at s1 that was NOT skipped, so *state may be ahead of the returned
       pointer -- confirm that this is intended. */
    *state = st;
  }
  return s1;
}

/* Multibyte front end for mbsrwcsskip: skip the leading characters of
   s1 that occur in the multibyte string s2.

   s2 is converted to a temporary wide string with mbsrdup, which is
   released again before returning. If state is null, MBSTATE is used
   in its place.

   Returns the pointer computed by mbsrwcsskip. */
char const* mbsrskip(char const s1[static 1], mbstate_t*restrict state,
                     char const* s2) {
  if (!state) state = MBSTATE;
  wchar_t*restrict accept = mbsrdup(s2, 0);
  char const* stop = mbsrwcsskip(s1, state, accept);
  free(accept);
  return stop;
}

/* Byte length of the initial part of s1 that mbsrskip skips for the
   character set s2. A null s1 gives 0. */
size_t mbsspn(char const* s1, char const* s2) {
  if (!s1) return 0;
  return mbsrskip(s1, 0, s2)-s1;
}

/* Skip the leading multibyte characters of s1 for which the
   classification function func returns nonzero.

   Characters are decoded one at a time with mbrtow, directly on
   *state. If state is null, MBSTATE is used in its place.

   Returns a pointer to the first character that was not skipped
   (possibly the terminating null character of s1). */
char const* mbsrskip_class(char const s1[static 1], mbstate_t*restrict state,
                           wcclass_t func) {
  if (!state) state = MBSTATE;
  while (s1[0]) {
    wchar_t wc = 0;
    size_t const width = mbrtow(&wc, s1, state);
    /* Nothing decoded, or a character outside the class: stop here. */
    if (!wc) break;
    if (!func(wc)) break;
    s1 += width;
  }
  return s1;
}

/* Byte length of the initial part of s1 that mbsrskip_class skips for
   the classification function func. A null s1 gives 0. */
size_t mbsspn_class(char const* s1, wcclass_t func) {
  if (!s1) return 0;
  return mbsrskip_class(s1, 0, func)-s1;
}

/* Skip the leading multibyte characters of s1 that have the property
   type, as tested by iswctype.

   Characters are decoded one at a time with mbrtow, directly on
   *state. If state is null, MBSTATE is used in its place.

   Returns a pointer to the first character that was not skipped
   (possibly the terminating null character of s1). */
char const* mbsrskip_type(char const s1[static 1], mbstate_t*restrict state, wctype_t type) {
  if (!state) state = MBSTATE;
  while (s1[0]) {
    wchar_t wc = 0;
    size_t const width = mbrtow(&wc, s1, state);
    /* Nothing decoded, or a character without the property: stop. */
    if (!wc) break;
    if (!iswctype(wc, type)) break;
    s1 += width;
  }
  return s1;
}

/* Byte length of the initial part of s1 that mbsrskip_type skips for
   the character property type. A null s1 gives 0. */
size_t mbsspn_type(char const* s1, wctype_t type) {
  if (!s1) return 0;
  return mbsrskip_type(s1, 0, type)-s1;
}

/* Same as mbsspn_type, but the character property is given by its
   name, which is looked up with wctype. */
size_t mbsspn_name(char const* s1, char const name[static 1]) {
  wctype_t property = wctype(name);
  return mbsspn_type(s1, property);
}

/* Skip over the leading multibyte characters of s1 whose wide form is
   NOT found in the wide string S2 (complement of mbsrwcsskip).

   A null state is replaced by MBSTATE. If S2 is null, s1 is returned
   as is.

   Returns a pointer to the first character of s1 that occurs in S2, or
   to the position where decoding stopped (possibly the terminating
   null character). */
char const* mbsrwcscskip(char const s1[static 1], mbstate_t*restrict state,
                         wchar_t const* S2) {
  if (!state) state = MBSTATE;
  if (S2) {
    /* Decode on a copy; *state is written back once, after the loop. */
    mbstate_t st = *state;
    for (size_t len; s1[0]; s1 += len) {
      /* Zero-initialized, so that after one or two stored characters
         S1 is a null-terminated wide string. */
      wchar_t S1[3] = { 0 };
      len = mbrtow(&S1[0], s1, &st);
      if (!S1[0]) break;
      if (!iswlowsurrogate(S1[0])) {
        /* Single wide character: stop as soon as it is in S2. */
        if (wcschr(S2, S1[0])) break;
      } else {
        /* NOTE(review): the second mbrtow call reads from s1 again, not
           from s1+len; presumably it relies on the state to deliver the
           second half of a surrogate pair -- TODO confirm. */
        len += mbrtow(&S1[1], s1, &st);
        /* The pair is looked up as a two-character substring of S2. */
        if (wcsstr(S2, S1)) break;
      }
    }
    /* NOTE(review): after a break, st has already consumed the character
       at s1 that the returned pointer still points to, so *state may be
       ahead of that pointer -- confirm that this is intended. */
    *state = st;
  }
  return s1;
}

/* Multibyte front end for mbsrwcscskip: skip the leading characters of
   s1 that do not occur in the multibyte string s2.

   s2 is converted to a temporary wide string with mbsrdup, which is
   released again before returning. If state is null, MBSTATE is used
   in its place.

   Returns the pointer computed by mbsrwcscskip. */
char const* mbsrcskip(char const* s1, mbstate_t*restrict state, char const* s2) {
  if (!state) state = MBSTATE;
  wchar_t*restrict reject = mbsrdup(s2, 0);
  char const* stop = mbsrwcscskip(s1, state, reject);
  free(reject);
  return stop;
}

/* Byte length of the initial part of s1 that mbsrcskip skips for the
   character set s2. Gives 0 if s1 or s2 is null. */
size_t mbscspn(char const* s1, char const* s2) {
  if (!s1 || !s2) return 0;
  return mbsrcskip(s1, 0, s2)-s1;
}


/* strtod for multibyte strings: first skips the leading characters of
   s1 that iswspace accepts (via mbsrskip_class), then lets strtod
   convert from there. endptr is passed on to strtod. If state is null,
   MBSTATE is used in its place. */
double mbsrtod(char const*restrict s1, mbstate_t*restrict state,
              char**restrict endptr) {
  if (!state) state = MBSTATE;
  char const* start = mbsrskip_class(s1, state, iswspace);
  return strtod(start, endptr);
}

/* strtold for multibyte strings: first skips the leading characters of
   s1 that iswspace accepts (via mbsrskip_class), then lets strtold
   convert from there. endptr is passed on to strtold. If state is
   null, MBSTATE is used in its place. */
long double mbsrtold(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr) {
  if (!state) state = MBSTATE;
  char const* start = mbsrskip_class(s1, state, iswspace);
  return strtold(start, endptr);
}

/* strtol for multibyte strings: first skips the leading characters of
   s1 that iswspace accepts (via mbsrskip_class), then lets strtol
   convert from there. endptr and base are passed on to strtol. If
   state is null, MBSTATE is used in its place. */
long mbsrtol(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base) {
  if (!state) state = MBSTATE;
  char const* start = mbsrskip_class(s1, state, iswspace);
  return strtol(start, endptr, base);
}

/* strtoll for multibyte strings: first skips the leading characters of
   s1 that iswspace accepts (via mbsrskip_class), then lets strtoll
   convert from there. endptr and base are passed on to strtoll. If
   state is null, MBSTATE is used in its place. */
long long mbsrtoll(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base) {
  if (!state) state = MBSTATE;
  char const* start = mbsrskip_class(s1, state, iswspace);
  return strtoll(start, endptr, base);
}

/* strtoul for multibyte strings: first skips the leading characters of
   s1 that iswspace accepts (via mbsrskip_class), then lets strtoul
   convert from there. endptr and base are passed on to strtoul. If
   state is null, MBSTATE is used in its place. */
unsigned long mbsrtoul(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base) {
  if (!state) state = MBSTATE;
  char const* start = mbsrskip_class(s1, state, iswspace);
  return strtoul(start, endptr, base);
}

/* strtoull for multibyte strings: first skips the leading characters
   of s1 that iswspace accepts (via mbsrskip_class), then lets strtoull
   convert from there. endptr and base are passed on to strtoull. If
   state is null, MBSTATE is used in its place. */
unsigned long long mbsrtoull(char const*restrict s1, mbstate_t*restrict state, char**restrict endptr, int base) {
  if (!state) state = MBSTATE;
  char const* start = mbsrskip_class(s1, state, iswspace);
  return strtoull(start, endptr, base);
}



/* Copy the bytes of the string s, including its terminating null
   character, into the buffer t of n bytes -- but only if everything
   fits.

   state describes the shift state the target is currently in. If it is
   not an initial state (as judged by mbsinit), the byte sequence that
   wcrtomb produces for a null wide character in that state is written
   to t first, and s is placed so that it overwrites the null byte at
   the end of that sequence. *state itself is never modified.

   Returns a pointer to the terminating null character stored in t, or
   a null pointer if no null byte is found in the first n bytes of s or
   if the result would not fit. */
char* mbsrmbsncpy(size_t n, char t[restrict n], mbstate_t const*restrict state,
                  char const s [restrict static 1]) {
  /* s must be terminated within the first n bytes. */
  char const* terminator = memchr(s, 0, n);
  if (!terminator) return 0;
  size_t const payload = terminator - s;
  if (!mbsinit(state)) {
    /* mbsinit accepts null pointers, so here state is non-null. */
    mbstate_t scratch = *state;
    char reset[2*MB_LEN_MAX];
    size_t const resetlen = wcrtomb(reset, 0, &scratch);
    /* Reset sequence and s together have to fit into t. */
    if (payload+resetlen >= n) return 0;
    memcpy(t, reset, resetlen);
    /* Continue on top of the last byte that was just written. */
    size_t const advance = resetlen-1;
    t += advance;
    n -= advance;
  }
  if (payload >= n) return 0;
  memcpy(t, s, payload+1);
  return t + payload;
}

/* Copy the multibyte string s, which is to be interpreted starting in
   shift state sstate, into the buffer t of n bytes, whose current
   shift state is tstate.

   If sstate is an initial state (per mbsinit), a plain byte copy via
   mbsrmbsncpy is attempted first. If that is not possible, s is
   re-encoded one character at a time: each character is decoded with
   mbrtowc and encoded again with wcsrtombs for the state of t. Null
   state pointers are replaced by MBSTATE for this second phase.
   *tstate is updated as characters are stored; *sstate is only read.

   Returns the result of mbsrmbsncpy if the byte copy succeeded,
   otherwise a pointer just behind the last byte stored in t by the
   loop.

   NOTE(review): the loop copies tlen bytes per character, and the
   wcsrtombs return value does not count a terminating null character,
   so the re-encoding path appears not to store a terminator in t --
   confirm what callers expect. */
char* mbsrncpy(size_t n, char t[restrict n], mbstate_t*restrict tstate,
               char const s [restrict static 1], mbstate_t const*restrict sstate) {
  /* First check if the bytes can just be copied over. */
  if (mbsinit(sstate)) {
    char* ret = mbsrmbsncpy(n, t, tstate, s);
    if (ret) return ret;
  }
  /* tt is the current write position inside t. */
  char* tt = t;
  if (!tstate) tstate = MBSTATE;
  if (!sstate) sstate = MBSTATE;
  /* Decode on a copy, the source state object is const. */
  mbstate_t sst = *sstate;
  char buf[2*MB_LEN_MAX];
  for (size_t slen = 0; s[0] && n; s += slen) {
    /* One wide character plus terminator, as input for wcsrtombs. */
    wchar_t S[2] = { 0 };
    /* NOTE(review): n is the remaining capacity of t, but is used here
       as the maximum number of source bytes to inspect -- confirm. */
    slen = mbrtowc(S, s, n, &sst);
    /* Here, wcsrtombs can't fail, but because of the different state
       of t, tlen may be larger than slen. */
    wchar_t const* Sp = S;
    /* Encode on a copy of the target state; commit only if it fits. */
    mbstate_t tst = *tstate;
    /* NOTE(review): the limit passed is n although buf only holds
       2*MB_LEN_MAX bytes; presumably safe because S holds at most one
       character -- confirm. */
    size_t tlen = wcsrtombs(buf, &Sp, n, &tst);
    if (tlen > n) break;
    *tstate = tst;
    memcpy(tt, buf, tlen);
    /* S[0] is still zero if mbrtowc stored nothing or a null character. */
    if (!S[0]) break;
    tt += tlen;
    n -= tlen;
  }
  return tt;
}

C11에서는 로케일에 의존하지 않는 문자 인코딩을 다루기 위한 타입인 char16_t, char32_t 등도 지원한다.

14.4. Binary streams.

printf나 fputs 같은 텍스트 입출력에서는 프로그램이 읽고 쓰는 내용과 대상 파일이나 디바이스에 실제로 저장되는 내용이 일대일로 대응된다는 보장이 없다.

  • 공백 문자의 인코딩이 플랫폼에 따라 다를 수 있다.
  • 개행 문자 바로 앞의 공백 문자는 무시될 수 있다.
  • 멀티바이트 문자는 파일의 파일시스템이 기반한 플랫폼의 문자로 변환된다.

따라서 포터블한 파일 입출력 방법은 스트림을 바이너리 모드로 열어 fgetc나 fputc를 쓰는 방법밖에 없다. 이를 더 편하게 쓰기 위해 C에서는 다음의 인터페이스를 제공한다.

size_t fread(void* restrict ptr, size_t size, size_t nmemb, FILE* restrict stream);
size_t fwrite(void const*restrict ptr, size_t size, size_t nmemb, FILE* restrict stream);
int fseek(FILE* stream, long int offset, int whence);
long int ftell(FILE* stream);

fread와 fwrite는 크기가 size 바이트인 원소 nmemb개를 ptr이 가리키는 버퍼와 스트림 사이에서 변환 없이 읽고 쓰는 함수이며, 실제로 처리한 원소의 개수를 리턴한다. ftell은 현재 파일에서의 스트림 위치를 리턴하고 fseek는 파일에서 스트림 위치를 변경시킨다.

Takeaway 2.14.4.1. fread나 fwrite를 사용할 스트림은 바이너리 모드로 열어라.

Takeaway 2.14.4.2. 바이너리 모드로 쓰여진 파일은 플랫폼 간에 호환되지 않는다.

fseek과 ftell은 LONG_MAX 오프셋 이상을 다룰 수 없다.

Takeaway 2.14.4.3. fseek과 ftell은 아주 큰 파일을 다루는 데 적합하지 않다.

14.5. Error checking and cleanup.

14.1에서 다룬, 숫자를 입력받아 출력하는 코드에서는 플랫폼에 대한 사전 조건을 전처리기 조건문으로 검사하고, fprintnumbers의 처음 세 줄에서는 런타임 오류를 에러 코드를 반환하는 방식으로 처리하고 있다. 이러한 오류의 뒤처리는 error_cleanup에서 이루어진다. 또는 goto를 사용하여 최적화될 수도 있다.

Takeaway 2.14.5.1. goto 라벨은 그것을 포함하는 함수 전체에서 접근 가능하다.

Takeaway 2.14.5.2. goto는 같은 함수 내의 라벨로만 점프할 수 있다.

Takeaway 2.14.5.3. goto는 변수 초기화를 뛰어넘어서는 안 된다.

요점 정리

  • C 라이브러리는 텍스트 처리에 대한 인터페이스를 제공하지만 const 보호와 버퍼 오버플로우에 유의해야 한다.
  • scanf는 포인터 입력, 문자열의 널 끝맺음, 공백, 개행 문자 관련 여러 이슈가 있다. 가능하면 fgets나 strtod를 대신 사용하라.
  • 확장 문자 집합은 멀티바이트 문자열로 다룰 수 있다. 이는 일반 문자열을 다룰 때와 비슷한 방식으로 쓸 수 있다.
  • 바이너리 데이터는 fwrite와 fread로 바이너리 입출력을 해야 하고, 플랫폼간 호환이 되지 않는다.
  • C 표준 라이브러리 함수의 호출은 에러값 반환을 체크해야 한다.
  • 에러 처리는 복잡한 케이스별 분석이 필요하다. goto 점프로 이를 조직화할 수 있다.

답글 남기기

아래 항목을 채우거나 오른쪽 아이콘 중 하나를 클릭하여 로그 인 하세요:

WordPress.com 로고

WordPress.com의 계정을 사용하여 댓글을 남깁니다. 로그아웃 /  변경 )

Google photo

Google의 계정을 사용하여 댓글을 남깁니다. 로그아웃 /  변경 )

Twitter 사진

Twitter의 계정을 사용하여 댓글을 남깁니다. 로그아웃 /  변경 )

Facebook 사진

Facebook의 계정을 사용하여 댓글을 남깁니다. 로그아웃 /  변경 )

%s에 연결하는 중