Unicode の East Asian Width による文字幅チェック

キャラクタ端末で文字の表示幅を求めるには、Unicode の文字プロパティ East Asian Width を調べます。プロパティが F または W なら全角、A なら使っているフォント次第で全角か半角、それ以外なら半角とします。

http://www.unicode.org/reports/tr11/

Unicode Character Database の EastAsianWidth.txt をダウンロードして、「F か W」、「A」、「それ以外」のそれぞれについてコードポイントの範囲の登録数を比較してみると、「F か W」と「A」の 2 つの場合の方が圧倒的に数が少ないので、それらを Perl で抽出して C 言語のための索引を作ることにします。索引は、C 言語のルーチンから扱いやすくするために、すべて範囲の開始コードポイントと終了コードポイントの組の並びにしておきます。さらに、Perl で単純に索引を生成して眺めてみたところ、範囲が隣接している箇所が多いため、隣接している範囲をすべてまとめることにします。

#!/bin/env perl
use strict;
use warnings;

our $USAGE = 'perl eastasian.pl ucd/EastAsianWidth.txt';

# pickup W F and A codepoint from Unicode ucd/EastAsianWidth.txt

# Merge adjacent or overlapping code point ranges in place.
#
# $table is a reference to an array of [first, last] pairs (both inclusive).
# After the call the array holds the smallest set of ranges covering the same
# code points: sorted by first code point, pairwise disjoint, and never
# directly adjacent.  The generated C table is binary-searched, so this
# invariant is established here instead of being assumed of the input.
sub combine {
    my ($table) = @_;
    my @merged;
    for my $range (sort { $a->[0] <=> $b->[0] } @{$table}) {
        my ($first, $last) = @{$range};
        if (@merged && $first <= $merged[-1][1] + 1) {
            # Touches or overlaps the previous range: extend it, but never
            # shrink it when the new range is fully contained.
            $merged[-1][1] = $last if $last > $merged[-1][1];
        }
        else {
            # Store a copy so the caller's original pairs are never modified.
            push @merged, [$first, $last];
        }
    }
    @{$table} = @merged;
}

# Collect the wide (W, F) and ambiguous (A) code point ranges from the
# EastAsianWidth.txt data read via <> (files named on the command line, or
# standard input), then merge neighbouring ranges.
my @full_width;
my @ambiguous_width;
while (my $line = <>) {
    # A data line is "FIRST[..LAST];CLASS".  Spaces around the semicolon are
    # accepted as well, because UCD fields may be padded and such padding is
    # not significant.  Comment lines and other classes simply do not match.
    my ($first_hex, $last_hex, $ea) = $line =~ m{
        \A ([0-9A-F]+)                 # first code point
        (?: [.][.] ([0-9A-F]+) )?      # optional last code point of a range
        \s* ; \s*                      # field separator, optionally padded
        ([WFA]) \b                     # only the classes that get indexed
    }x or next;

    my $first = hex($first_hex);
    my $last  = defined $last_hex ? hex($last_hex) : $first;
    my $bucket = $ea eq 'A' ? \@ambiguous_width : \@full_width;
    push @{$bucket}, [$first, $last];
}

# Empty tables would produce C source that does not compile and cannot be
# searched, so refuse to continue when the input held no usable entries.
die "no W, F or A entries found in input\nusage: $USAGE\n"
    unless @full_width || @ambiguous_width;

combine(\@full_width);
combine(\@ambiguous_width);

# Write the generated C source to standard output: one enum carrying the
# number of ranges in each table, followed by the two tables themselves.
printf "enum {EASTASIAN_FULL_COUNT = %d, EASTASIAN_AMBIGUOUS_COUNT = %d};\n",
       scalar(@full_width), scalar(@ambiguous_width);

for my $spec (['EASTASIAN_FULL',      \@full_width],
              ['EASTASIAN_AMBIGUOUS', \@ambiguous_width]) {
    my ($c_name, $ranges) = @{$spec};
    my $final = $#{$ranges};

    print "static int32_t const ", $c_name, "[] = {\n";
    for my $n (0 .. $final) {
        # Four "first,last" pairs per output line, indented by four spaces.
        my $text = $n % 4 == 0 ? "    " : "";
        $text .= sprintf "0x%05x,0x%05x", @{ $ranges->[$n] }[0, 1];
        # Every pair except the very last is followed by a separator.
        if ($n < $final) {
            $text .= $n % 4 == 3 ? ",\n" : ", ";
        }
        print $text;
    }
    print "\n};\n";
}

これで生成した範囲には重なりはなく、昇順で並んでいます。なので、あるコードポイントがテーブルに登録してあるかどうかを調べるには、範囲の開始コードポイントで二分探索し、見つかった範囲内に探したいコードポイントが含まれているかどうかをチェックすることにします。

/* Unicode 7.0.0 East Asian Width
 * see http://www.unicode.org/reports/tr11/
 *
 * EASTASIAN_FULL has class F and W
 * EASTASIAN_AMBIGUOUS has class A
 */
#include <stdint.h>
#include <stdbool.h>

/* Number of [first, last] code point ranges stored in each table; every
 * range occupies two consecutive array elements. */
enum {EASTASIAN_FULL_COUNT = 35, EASTASIAN_AMBIGUOUS_COUNT = 173};
/* Code point ranges of class F and W, flattened as first0, last0, first1,
 * last1, ...  Both ends are inclusive; the ranges are listed in ascending
 * order. */
static int32_t const EASTASIAN_FULL[] = {
    0x01100,0x0115f, 0x02329,0x0232a, 0x02e80,0x02e99, 0x02e9b,0x02ef3,
    0x02f00,0x02fd5, 0x02ff0,0x02ffb, 0x03000,0x0303e, 0x03041,0x03096,
    0x03099,0x030ff, 0x03105,0x0312d, 0x03131,0x0318e, 0x03190,0x031ba,
    0x031c0,0x031e3, 0x031f0,0x0321e, 0x03220,0x03247, 0x03250,0x032fe,
    0x03300,0x04dbf, 0x04e00,0x0a48c, 0x0a490,0x0a4c6, 0x0a960,0x0a97c,
    0x0ac00,0x0d7a3, 0x0f900,0x0faff, 0x0fe10,0x0fe19, 0x0fe30,0x0fe52,
    0x0fe54,0x0fe66, 0x0fe68,0x0fe6b, 0x0ff01,0x0ff60, 0x0ffe0,0x0ffe6,
    0x1b000,0x1b001, 0x1f200,0x1f202, 0x1f210,0x1f23a, 0x1f240,0x1f248,
    0x1f250,0x1f251, 0x20000,0x2fffd, 0x30000,0x3fffd
};
/* Code point ranges of class A (ambiguous width), flattened as first0,
 * last0, first1, last1, ...  Both ends are inclusive; the ranges are listed
 * in ascending order. */
static int32_t const EASTASIAN_AMBIGUOUS[] = {
    0x000a1,0x000a1, 0x000a4,0x000a4, 0x000a7,0x000a8, 0x000aa,0x000aa,
    0x000ad,0x000ae, 0x000b0,0x000b4, 0x000b6,0x000ba, 0x000bc,0x000bf,
    0x000c6,0x000c6, 0x000d0,0x000d0, 0x000d7,0x000d8, 0x000de,0x000e1,
    0x000e6,0x000e6, 0x000e8,0x000ea, 0x000ec,0x000ed, 0x000f0,0x000f0,
    0x000f2,0x000f3, 0x000f7,0x000fa, 0x000fc,0x000fc, 0x000fe,0x000fe,
    0x00101,0x00101, 0x00111,0x00111, 0x00113,0x00113, 0x0011b,0x0011b,
    0x00126,0x00127, 0x0012b,0x0012b, 0x00131,0x00133, 0x00138,0x00138,
    0x0013f,0x00142, 0x00144,0x00144, 0x00148,0x0014b, 0x0014d,0x0014d,
    0x00152,0x00153, 0x00166,0x00167, 0x0016b,0x0016b, 0x001ce,0x001ce,
    0x001d0,0x001d0, 0x001d2,0x001d2, 0x001d4,0x001d4, 0x001d6,0x001d6,
    0x001d8,0x001d8, 0x001da,0x001da, 0x001dc,0x001dc, 0x00251,0x00251,
    0x00261,0x00261, 0x002c4,0x002c4, 0x002c7,0x002c7, 0x002c9,0x002cb,
    0x002cd,0x002cd, 0x002d0,0x002d0, 0x002d8,0x002db, 0x002dd,0x002dd,
    0x002df,0x002df, 0x00300,0x0036f, 0x00391,0x003a1, 0x003a3,0x003a9,
    0x003b1,0x003c1, 0x003c3,0x003c9, 0x00401,0x00401, 0x00410,0x0044f,
    0x00451,0x00451, 0x02010,0x02010, 0x02013,0x02016, 0x02018,0x02019,
    0x0201c,0x0201d, 0x02020,0x02022, 0x02024,0x02027, 0x02030,0x02030,
    0x02032,0x02033, 0x02035,0x02035, 0x0203b,0x0203b, 0x0203e,0x0203e,
    0x02074,0x02074, 0x0207f,0x0207f, 0x02081,0x02084, 0x020ac,0x020ac,
    0x02103,0x02103, 0x02105,0x02105, 0x02109,0x02109, 0x02113,0x02113,
    0x02116,0x02116, 0x02121,0x02122, 0x02126,0x02126, 0x0212b,0x0212b,
    0x02153,0x02154, 0x0215b,0x0215e, 0x02160,0x0216b, 0x02170,0x02179,
    0x02189,0x02189, 0x02190,0x02199, 0x021b8,0x021b9, 0x021d2,0x021d2,
    0x021d4,0x021d4, 0x021e7,0x021e7, 0x02200,0x02200, 0x02202,0x02203,
    0x02207,0x02208, 0x0220b,0x0220b, 0x0220f,0x0220f, 0x02211,0x02211,
    0x02215,0x02215, 0x0221a,0x0221a, 0x0221d,0x02220, 0x02223,0x02223,
    0x02225,0x02225, 0x02227,0x0222c, 0x0222e,0x0222e, 0x02234,0x02237,
    0x0223c,0x0223d, 0x02248,0x02248, 0x0224c,0x0224c, 0x02252,0x02252,
    0x02260,0x02261, 0x02264,0x02267, 0x0226a,0x0226b, 0x0226e,0x0226f,
    0x02282,0x02283, 0x02286,0x02287, 0x02295,0x02295, 0x02299,0x02299,
    0x022a5,0x022a5, 0x022bf,0x022bf, 0x02312,0x02312, 0x02460,0x024e9,
    0x024eb,0x0254b, 0x02550,0x02573, 0x02580,0x0258f, 0x02592,0x02595,
    0x025a0,0x025a1, 0x025a3,0x025a9, 0x025b2,0x025b3, 0x025b6,0x025b7,
    0x025bc,0x025bd, 0x025c0,0x025c1, 0x025c6,0x025c8, 0x025cb,0x025cb,
    0x025ce,0x025d1, 0x025e2,0x025e5, 0x025ef,0x025ef, 0x02605,0x02606,
    0x02609,0x02609, 0x0260e,0x0260f, 0x02614,0x02615, 0x0261c,0x0261c,
    0x0261e,0x0261e, 0x02640,0x02640, 0x02642,0x02642, 0x02660,0x02661,
    0x02663,0x02665, 0x02667,0x0266a, 0x0266c,0x0266d, 0x0266f,0x0266f,
    0x0269e,0x0269f, 0x026be,0x026bf, 0x026c4,0x026cd, 0x026cf,0x026e1,
    0x026e3,0x026e3, 0x026e8,0x026ff, 0x0273d,0x0273d, 0x02757,0x02757,
    0x02776,0x0277f, 0x02b55,0x02b59, 0x03248,0x0324f, 0x0e000,0x0f8ff,
    0x0fe00,0x0fe0f, 0x0fffd,0x0fffd, 0x1f100,0x1f10a, 0x1f110,0x1f12d,
    0x1f130,0x1f169, 0x1f170,0x1f19a, 0xe0100,0xe01ef, 0xf0000,0xffffd,
    0x100000,0x10fffd
};

/* Report whether code point c lies inside one of the ranges in table.
 *
 * table holds count inclusive ranges flattened as first0, last0, first1,
 * last1, ...; the ranges must be sorted in ascending order and must not
 * overlap.  An empty table (count <= 0) yields false and table is not
 * dereferenced in that case.
 */
static bool
search_table (int32_t const * const table, int const count, int32_t const c)
{
    /* Check count first: reading table[0] of an empty table is out of bounds. */
    if (count <= 0 || c < table[0])
        return false;
    int lo = 0;
    int hi = count;
    /* Binary search over the range start values only. */
    while (lo < hi) {
        int const mid = lo + (hi - lo) / 2;
        /* Keep the element type: plain int may be narrower than 32 bits. */
        int32_t const first = table[mid * 2];
        if (c < first)
            hi = mid;
        else if (c > first)
            lo = mid + 1;
        else
            return true;
    }
    /* No start equals c; lo is the number of ranges starting below c, so the
     * only candidate is range lo - 1.  Test c against its inclusive bounds. */
    return lo > 0 && c >= table[lo * 2 - 2] && c <= table[lo * 2 - 1];
}

/* Report whether code point c occupies two columns on a character terminal.
 *
 * Code points listed in EASTASIAN_FULL are always full width.  Code points
 * listed in EASTASIAN_AMBIGUOUS count as full width only when
 * ambiguous_is_full is true.  Everything else is not full width.
 */
bool
isfullwidth (int32_t const c, bool const ambiguous_is_full)
{
    bool const is_full =
        search_table (EASTASIAN_FULL, EASTASIAN_FULL_COUNT, c);

    /* The ambiguous table is consulted only when the first lookup missed
     * and the caller asked for ambiguous characters to count as wide. */
    return is_full
        || (ambiguous_is_full
            && search_table (EASTASIAN_AMBIGUOUS,
                             EASTASIAN_AMBIGUOUS_COUNT, c));
}