Last data update: 2014.03.03

R: Conversion Tables between Character Sets
charsets {tools}    R Documentation

Conversion Tables between Character Sets

Description

charset_to_Unicode is a matrix of Unicode code points with columns for the common 8-bit encodings.

Adobe_glyphs is a data frame which gives Adobe glyph names for Unicode code points. It has two character columns, "adobe" and "unicode" (a 4-digit hex representation).

Usage

charset_to_Unicode

Adobe_glyphs

Details

charset_to_Unicode is an integer matrix of class c("noquote", "hexmode"), so it prints in hexadecimal. The mappings are those used by libiconv; sources differ in the way quotes and minus/hyphen are mapped (and the PostScript encoding files use a different mapping).

Adobe_glyphs includes all the Adobe glyph names which correspond to single Unicode characters. It is sorted by Unicode code point and within a point alphabetically on the glyph (there can be more than one name for a Unicode code point). The data are in the file ‘R_HOME/share/encodings/Adobe_glyphlist’.

Source

https://partners.adobe.com/public/developer/en/opentype/glyphlist.txt

Examples

## find Adobe names for ISOLatin2 chars.
## Unicode code points of the "ISOLatin2" column of the conversion matrix.
latin2 <- charset_to_Unicode[, "ISOLatin2"]
## The matrix is classed c("noquote", "hexmode"); match() and %in% may
## transform classed objects before comparing (see ?mtfrm), so do all
## comparisons on plain integer code points instead.
latin2_codes <- as.integer(latin2)
## The "unicode" column holds 4-digit hex strings: parse them as base 16.
aUnicode <- strtoi(Adobe_glyphs$unicode, base = 16L)
## keep only the glyph entries whose code point occurs in ISOLatin2
keep <- aUnicode %in% latin2_codes
aUnicode <- aUnicode[keep]
aAdobe <- Adobe_glyphs$adobe[keep]
## first match (NA where a code point has no Adobe glyph name)
aLatin2 <- aAdobe[match(latin2_codes, aUnicode)]
## all matches (a code point can have more than one glyph name)
bLatin2 <- lapply(seq_along(latin2_codes),
                  function(i) aAdobe[aUnicode == latin2_codes[i]])
format(bLatin2, justify = "none")

Results


R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(tools)
> png(filename="/home/ddbj/snapshot/RGM3/R_rel/result/tools/charsets.Rd_%03d_medium.png", width=480, height=480)
> ### Name: charsets
> ### Title: Conversion Tables between Character Sets
> ### Aliases: Adobe_glyphs charset_to_Unicode
> ### Keywords: datasets
> 
> ### ** Examples
> 
> ## find Adobe names for ISOLatin2 chars.
> latin2 <- charset_to_Unicode[, "ISOLatin2"]
> aUnicode <- as.numeric(paste0("0x", Adobe_glyphs$unicode))
> keep <- aUnicode %in% latin2
> aUnicode <- aUnicode[keep]
> aAdobe <- Adobe_glyphs[keep, 1]
> ## first match
> aLatin2 <- aAdobe[match(latin2, aUnicode)]
> ## all matches
> bLatin2 <- lapply(1:256, function(x) aAdobe[aUnicode == latin2[x]])
> format(bLatin2, justify = "none")
  [1] ""                          "controlSTX"               
  [3] "controlSOT"                "controlETX"               
  [5] "controlEOT"                "controlENQ"               
  [7] "controlACK"                "controlBEL"               
  [9] "controlBS"                 "controlHT"                
 [11] "controlLF"                 "controlVT"                
 [13] "controlFF"                 "controlCR"                
 [15] "controlSO"                 "controlSI"                
 [17] "controlDLE"                "controlDC1"               
 [19] "controlDC2"                "controlDC3"               
 [21] "controlDC4"                "controlNAK"               
 [23] "controlSYN"                "controlETB"               
 [25] "controlCAN"                "controlEM"                
 [27] "controlSUB"                "controlESC"               
 [29] "controlFS"                 "controlGS"                
 [31] "controlRS"                 "controlUS"                
 [33] "space, spacehackarabic"    "exclam"                   
 [35] "quotedbl"                  "numbersign"               
 [37] "dollar"                    "percent"                  
 [39] "ampersand"                 "quotesingle"              
 [41] "parenleft"                 "parenright"               
 [43] "asterisk"                  "plus"                     
 [45] "comma"                     "hyphen"                   
 [47] "period"                    "slash"                    
 [49] "zero"                      "one"                      
 [51] "two"                       "three"                    
 [53] "four"                      "five"                     
 [55] "six"                       "seven"                    
 [57] "eight"                     "nine"                     
 [59] "colon"                     "semicolon"                
 [61] "less"                      "equal"                    
 [63] "greater"                   "question"                 
 [65] "at"                        "A"                        
 [67] "B"                         "C"                        
 [69] "D"                         "E"                        
 [71] "F"                         "G"                        
 [73] "H"                         "I"                        
 [75] "J"                         "K"                        
 [77] "L"                         "M"                        
 [79] "N"                         "O"                        
 [81] "P"                         "Q"                        
 [83] "R"                         "S"                        
 [85] "T"                         "U"                        
 [87] "V"                         "W"                        
 [89] "X"                         "Y"                        
 [91] "Z"                         "bracketleft"              
 [93] "backslash"                 "bracketright"             
 [95] "asciicircum"               "underscore"               
 [97] "grave"                     "a"                        
 [99] "b"                         "c"                        
[101] "d"                         "e"                        
[103] "f"                         "g"                        
[105] "h"                         "i"                        
[107] "j"                         "k"                        
[109] "l"                         "m"                        
[111] "n"                         "o"                        
[113] "p"                         "q"                        
[115] "r"                         "s"                        
[117] "t"                         "u"                        
[119] "v"                         "w"                        
[121] "x"                         "y"                        
[123] "z"                         "braceleft"                
[125] "bar, verticalbar"          "braceright"               
[127] "asciitilde"                "controlDEL"               
[129] ""                          ""                         
[131] ""                          ""                         
[133] ""                          ""                         
[135] ""                          ""                         
[137] ""                          ""                         
[139] ""                          ""                         
[141] ""                          ""                         
[143] ""                          ""                         
[145] ""                          ""                         
[147] ""                          ""                         
[149] ""                          ""                         
[151] ""                          ""                         
[153] ""                          ""                         
[155] ""                          ""                         
[157] ""                          ""                         
[159] ""                          ""                         
[161] "nbspace, nonbreakingspace" "Aogonek"                  
[163] "breve"                     "Lslash"                   
[165] "currency"                  "Lcaron"                   
[167] "Sacute"                    "section"                  
[169] "dieresis"                  "Scaron"                   
[171] "Scedilla"                  "Tcaron"                   
[173] "Zacute"                    "sfthyphen, softhyphen"    
[175] "Zcaron"                    "Zdot, Zdotaccent"         
[177] "degree"                    "aogonek"                  
[179] "ogonek"                    "lslash"                   
[181] "acute"                     "lcaron"                   
[183] "sacute"                    "caron"                    
[185] "cedilla"                   "scaron"                   
[187] "scedilla"                  "tcaron"                   
[189] "zacute"                    "hungarumlaut"             
[191] "zcaron"                    "zdot, zdotaccent"         
[193] "Racute"                    "Aacute"                   
[195] "Acircumflex"               "Abreve"                   
[197] "Adieresis"                 "Lacute"                   
[199] "Cacute"                    "Ccedilla"                 
[201] "Ccaron"                    "Eacute"                   
[203] "Eogonek"                   "Edieresis"                
[205] "Ecaron"                    "Iacute"                   
[207] "Icircumflex"               "Dcaron"                   
[209] "Dcroat, Dslash"            "Nacute"                   
[211] "Ncaron"                    "Oacute"                   
[213] "Ocircumflex"               "Odblacute, Ohungarumlaut" 
[215] "Odieresis"                 "multiply"                 
[217] "Rcaron"                    "Uring"                    
[219] "Uacute"                    "Udblacute, Uhungarumlaut" 
[221] "Udieresis"                 "Yacute"                   
[223] "Tcedilla, Tcommaaccent"    "germandbls"               
[225] "racute"                    "aacute"                   
[227] "acircumflex"               "abreve"                   
[229] "adieresis"                 "lacute"                   
[231] "cacute"                    "ccedilla"                 
[233] "ccaron"                    "eacute"                   
[235] "eogonek"                   "edieresis"                
[237] "ecaron"                    "iacute"                   
[239] "icircumflex"               "dcaron"                   
[241] "dcroat, dmacron"           "nacute"                   
[243] "ncaron"                    "oacute"                   
[245] "ocircumflex"               "odblacute, ohungarumlaut" 
[247] "odieresis"                 "divide"                   
[249] "rcaron"                    "uring"                    
[251] "uacute"                    "udblacute, uhungarumlaut" 
[253] "udieresis"                 "yacute"                   
[255] "tcedilla, tcommaaccent"    "dotaccent"                
> 
> 
> 
> 
> 
> dev.off()
null device 
          1 
>