Last data update: 2014.03.03
R: Conversion Tables between Character Sets
Conversion Tables between Character Sets
Description
charset_to_Unicode
is a matrix of Unicode code points with
columns for the common 8-bit encodings.
Adobe_glyphs
is a data frame which gives Adobe glyph names for
Unicode code points. It has two character columns, "adobe"
and
"unicode"
(a 4-digit hex representation).
Usage
charset_to_Unicode
Adobe_glyphs
Details
charset_to_Unicode
is an integer matrix of class
c("noquote", "hexmode")
so prints in hexadecimal.
The mappings are those used by libiconv: there are differences
in the way quotes and minus/hyphen are mapped between sources (and the
postscript encoding files use a different mapping).
Adobe_glyphs
includes all the Adobe glyph names which correspond
to single Unicode characters. It is sorted by Unicode code point and
within a point alphabetically on the glyph (there can be more than one
name for a Unicode code point). The data are in the file
‘R_HOME/share/encodings/Adobe_glyphlist’.
Source
https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt
Examples
## Find the Adobe glyph names for the ISOLatin2 characters.
latin2 <- charset_to_Unicode[, "ISOLatin2"]
## charset_to_Unicode is classed c("noquote", "hexmode"); do the lookups on
## plain integer code points so both sides of every comparison have the
## same type and no class-specific matching rules are involved.
latin2_int <- as.integer(latin2)
## Adobe_glyphs$unicode holds hex strings: parse them explicitly in base 16.
aUnicode <- strtoi(Adobe_glyphs[["unicode"]], base = 16L)
## Keep only the glyphs whose code point occurs in ISOLatin2.
keep <- aUnicode %in% latin2_int
aUnicode <- aUnicode[keep]
aAdobe <- Adobe_glyphs[["adobe"]][keep]
## First match: one glyph name per code point (NA where there is none).
aLatin2 <- aAdobe[match(latin2_int, aUnicode)]
## All matches: a list with one character vector per ISOLatin2 position.
bLatin2 <- lapply(seq_along(latin2_int), function(i) {
  aAdobe[aUnicode == latin2_int[i]]
})
format(bLatin2, justify = "none")
Results
R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(tools)
> png(filename="/home/ddbj/snapshot/RGM3/R_rel/result/tools/charsets.Rd_%03d_medium.png", width=480, height=480)
> ### Name: charsets
> ### Title: Conversion Tables between Character Sets
> ### Aliases: Adobe_glyphs charset_to_Unicode
> ### Keywords: datasets
>
> ### ** Examples
>
> ## find Adobe names for ISOLatin2 chars.
> latin2 <- charset_to_Unicode[, "ISOLatin2"]
> aUnicode <- as.numeric(paste0("0x", Adobe_glyphs$unicode))
> keep <- aUnicode %in% latin2
> aUnicode <- aUnicode[keep]
> aAdobe <- Adobe_glyphs[keep, 1]
> ## first match
> aLatin2 <- aAdobe[match(latin2, aUnicode)]
> ## all matches
> bLatin2 <- lapply(1:256, function(x) aAdobe[aUnicode == latin2[x]])
> format(bLatin2, justify = "none")
[1] "" "controlSTX"
[3] "controlSOT" "controlETX"
[5] "controlEOT" "controlENQ"
[7] "controlACK" "controlBEL"
[9] "controlBS" "controlHT"
[11] "controlLF" "controlVT"
[13] "controlFF" "controlCR"
[15] "controlSO" "controlSI"
[17] "controlDLE" "controlDC1"
[19] "controlDC2" "controlDC3"
[21] "controlDC4" "controlNAK"
[23] "controlSYN" "controlETB"
[25] "controlCAN" "controlEM"
[27] "controlSUB" "controlESC"
[29] "controlFS" "controlGS"
[31] "controlRS" "controlUS"
[33] "space, spacehackarabic" "exclam"
[35] "quotedbl" "numbersign"
[37] "dollar" "percent"
[39] "ampersand" "quotesingle"
[41] "parenleft" "parenright"
[43] "asterisk" "plus"
[45] "comma" "hyphen"
[47] "period" "slash"
[49] "zero" "one"
[51] "two" "three"
[53] "four" "five"
[55] "six" "seven"
[57] "eight" "nine"
[59] "colon" "semicolon"
[61] "less" "equal"
[63] "greater" "question"
[65] "at" "A"
[67] "B" "C"
[69] "D" "E"
[71] "F" "G"
[73] "H" "I"
[75] "J" "K"
[77] "L" "M"
[79] "N" "O"
[81] "P" "Q"
[83] "R" "S"
[85] "T" "U"
[87] "V" "W"
[89] "X" "Y"
[91] "Z" "bracketleft"
[93] "backslash" "bracketright"
[95] "asciicircum" "underscore"
[97] "grave" "a"
[99] "b" "c"
[101] "d" "e"
[103] "f" "g"
[105] "h" "i"
[107] "j" "k"
[109] "l" "m"
[111] "n" "o"
[113] "p" "q"
[115] "r" "s"
[117] "t" "u"
[119] "v" "w"
[121] "x" "y"
[123] "z" "braceleft"
[125] "bar, verticalbar" "braceright"
[127] "asciitilde" "controlDEL"
[129] "" ""
[131] "" ""
[133] "" ""
[135] "" ""
[137] "" ""
[139] "" ""
[141] "" ""
[143] "" ""
[145] "" ""
[147] "" ""
[149] "" ""
[151] "" ""
[153] "" ""
[155] "" ""
[157] "" ""
[159] "" ""
[161] "nbspace, nonbreakingspace" "Aogonek"
[163] "breve" "Lslash"
[165] "currency" "Lcaron"
[167] "Sacute" "section"
[169] "dieresis" "Scaron"
[171] "Scedilla" "Tcaron"
[173] "Zacute" "sfthyphen, softhyphen"
[175] "Zcaron" "Zdot, Zdotaccent"
[177] "degree" "aogonek"
[179] "ogonek" "lslash"
[181] "acute" "lcaron"
[183] "sacute" "caron"
[185] "cedilla" "scaron"
[187] "scedilla" "tcaron"
[189] "zacute" "hungarumlaut"
[191] "zcaron" "zdot, zdotaccent"
[193] "Racute" "Aacute"
[195] "Acircumflex" "Abreve"
[197] "Adieresis" "Lacute"
[199] "Cacute" "Ccedilla"
[201] "Ccaron" "Eacute"
[203] "Eogonek" "Edieresis"
[205] "Ecaron" "Iacute"
[207] "Icircumflex" "Dcaron"
[209] "Dcroat, Dslash" "Nacute"
[211] "Ncaron" "Oacute"
[213] "Ocircumflex" "Odblacute, Ohungarumlaut"
[215] "Odieresis" "multiply"
[217] "Rcaron" "Uring"
[219] "Uacute" "Udblacute, Uhungarumlaut"
[221] "Udieresis" "Yacute"
[223] "Tcedilla, Tcommaaccent" "germandbls"
[225] "racute" "aacute"
[227] "acircumflex" "abreve"
[229] "adieresis" "lacute"
[231] "cacute" "ccedilla"
[233] "ccaron" "eacute"
[235] "eogonek" "edieresis"
[237] "ecaron" "iacute"
[239] "icircumflex" "dcaron"
[241] "dcroat, dmacron" "nacute"
[243] "ncaron" "oacute"
[245] "ocircumflex" "odblacute, ohungarumlaut"
[247] "odieresis" "divide"
[249] "rcaron" "uring"
[251] "uacute" "udblacute, uhungarumlaut"
[253] "udieresis" "yacute"
[255] "tcedilla, tcommaaccent" "dotaccent"
>
>
>
>
>
> dev.off()
null device
1
>