#!/bin/sh
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   https://encoding.spec.whatwg.org/#euc-jp
#   https://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
#   https://www.iana.org/assignments/charset-reg/CP51932
#   Table 3-64 in CJKV Information Processing 2/e.

# Download the following two files, run this script in the
# source/data/mappings directory, and save the result to euc-jp-html5.ucm:
#   https://encoding.spec.whatwg.org/index-jis0208.txt
#   https://encoding.spec.whatwg.org/index-jis0212.txt

# Print the fixed header of the generated .ucm file to stdout: the copyright
# banner, the converter attributes, the <icu:state> table, and the opening
# CHARMAP line.
# Arguments: none
# Outputs:   the header text on stdout
preamble() {
  # The delimiter is quoted so the body is emitted verbatim (no parameter,
  # command or backslash processing by the shell).
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for EUC-JP
# * described at https://encoding.spec.whatwg.org/#euc-jp.
# *
# ***************************************************************************
<code_set_name> "euc-jp-html"
<char_name_mask> "AXXXX"
<mb_cur_max> 3
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x3F
<icu:charsetFamily> "ASCII"

<icu:state> 0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state> a1-fe
<icu:state> a1-df
<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state> a1-fe.u

CHARMAP
PREAMBLE
}

# Print the 128 round-trip ('|0') mappings for single-byte ASCII, U+0000
# through U+007F, one per line, e.g.:
#   <U0000> \x00 |0
# Arguments: none
# Outputs:   128 mapping lines on stdout
ascii() {
  # Plain POSIX counting loop; avoids depending on the non-POSIX 'seq' tool.
  # Note: 'i' is a global, as it was in the original for-loop.
  i=0
  while [ "$i" -le 127 ]; do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}

# Map 0x8E 0x[A1-DF] to U+FF61 through U+FF9F as round-trip ('|0') entries.
# Arguments: none
# Outputs:   63 mapping lines on stdout, e.g. "<UFF61> \x8E\xA1 |0"
half_width_kana() {
  # Shell arithmetic understands C-style hex constants, so the byte range is
  # walked directly instead of relying on 'seq' accepting hexadecimal input.
  # Note: 'trail' is a global (plain POSIX sh has no 'local').
  trail=$((0xA1))
  while [ "$trail" -le "$((0xDF))" ]; do
    # Trail byte 0xA1 corresponds to U+FF61; the offset is constant.
    printf '<U%04X> \\x8E\\x%02X |0\n' "$((trail - 0xA1 + 0xFF61))" "$trail"
    trail=$((trail + 1))
  done
}

# index-jis0208.txt has index pointers larger than the size of
# the encoding space available in the 2-byte Graphic plane of ISO-2022-based
# encodings (94 x 94 = 8836, i.e. valid pointers are 0..8835). We have to
# exclude them because they're for Shift-JIS.
# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
# All the bi-directional mapping entries come *before* the uni-directional
# (EUC-JP to Unicode) entries so that we put '|3' if we have seen
# the same Unicode code point earlier in the list. According to the definition
# of 'index pointer' in the encoding spec, it's the first entry in the
# file for a given Unicode code point.
#
# Arguments: none
# Inputs:    index-jis0208.txt in the current directory
# Outputs:   one "<Uxxxx> \xLL\xTT |N" line per accepted entry on stdout
jis208() {
  # Decimal 161 (= 0xA1) is used because hexadecimal constants in awk program
  # text are not portable across awk implementations. int() makes the
  # truncation of the lead-byte quotient explicit.
  awk '
    !/^#/ && !/^$/ && $1 < 8836 {
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161,
             ($2 in seen) ? 3 : 0)
      seen[$2] = 1
    }
  ' index-jis0208.txt
}

# JIS X 0212 is for decoding only (use '|3' to denote that).
# Each entry is emitted as a 3-byte sequence: 0x8F followed by the lead and
# trail bytes derived from the index pointer.
#
# Arguments: none
# Inputs:    index-jis0212.txt in the current directory
# Outputs:   one "<Uxxxx> \x8F\xLL\xTT |3" line per entry on stdout
jis212() {
  # Decimal 161 (= 0xA1) is used because hexadecimal constants in awk program
  # text are not portable across awk implementations. int() makes the
  # truncation of the lead-byte quotient explicit.
  awk '
    !/^#/ && !/^$/ {
      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161)
    }
  ' index-jis0212.txt
}

# Print every mapping line of the table, unsorted: the ASCII, half-width
# kana, JIS X 0208 and JIS X 0212 sections in that order, followed by three
# extra '|1' entries for U+00A5, U+203E and U+2212.
# Arguments: none
# Outputs:   mapping lines on stdout
unsorted_table() {
  ascii
  half_width_kana
  jis208
  jis212
  # printf '%s\n' prints the backslashes literally; how 'echo' treats
  # backslash sequences is implementation-defined under sh.
  printf '%s\n' \
    '<U00A5> \x5C |1' \
    '<U203E> \x7E |1' \
    '<U2212> \xA1\xDD |1'
}

# Fetch the two WHATWG index tables into the current directory (-N: only
# re-download when the remote copy is newer; -nd: do not create a directory
# hierarchy), then write the complete .ucm file to stdout.
for index_file in index-jis0208.txt index-jis0212.txt; do
  # Stop on a failed download rather than generating a table from missing
  # or stale input.
  wget -N -r -nd "https://encoding.spec.whatwg.org/${index_file}" || {
    echo "error: failed to download ${index_file}" >&2
    exit 1
  }
done

preamble
# Byte-wise (C locale) sort keeps the output independent of the caller's
# locale; -u drops duplicate lines, as 'sort | uniq' did.
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'