package charnames;
use strict;
use warnings;
our $VERSION = '1.43';
use unicore::Name;    # mktables-generated algorithmically-defined names
use _charnames ();    # The submodule for this where most of the work gets done

use bytes ();         # for $bytes::hint_bits
use re "/aa";         # Everything in here should be ASCII

# Translate between Unicode character names and their code points.
# This is a wrapper around the submodule C<_charnames>.  This design allows
# C<_charnames> to be autoloaded to enable use of \N{...}, but requires this
# module to be explicitly requested for the functions API.

$Carp::Internal{ (__PACKAGE__) } = 1;

# import(CLASS, LIST)
#
# Called by 'use charnames LIST'.  Discards the class name and hands every
# remaining argument, unchanged, to _charnames->import.
sub import
{
  shift; ## ignore class name
  _charnames->import(@_);
}

# viacode(LIST)
#
# Thin wrapper: passes its arguments straight through to
# _charnames::viacode and returns whatever that returns.
sub viacode {
  return _charnames::viacode(@_);
}

# vianame(NAME)
#
# Looks up the character name and returns its ordinal if found, undef
# otherwise.
#
# Exactly one argument is required; otherwise a warning is issued via
# _charnames::carp and nothing is returned.
#
# A NAME of the form "U+<hex digits>" is handled here rather than being
# looked up.  Note that in this case a *character* (not an ordinal) is
# returned; see the comment in the body.
sub vianame
{
  if (@_ != 1) {
    _charnames::carp "charnames::vianame() expects one name argument";
    return;
  }

  my $arg = shift;

  if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {

    # khw claims that this is poor interface design.  The function should
    # return either an ord or a chr for all inputs; not be bipolar.  But
    # can't change it because of backward compatibility.  New code can use
    # string_vianame() instead.
    my $ord = CORE::hex $1;

    # Code points above 255 are refused when the hints recorded for our
    # caller ((caller 0)[8]) have the 'use bytes' bit set.
    return pack("U", $ord) if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);
    _charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord);
    return;
  }

  # The first 1 arg means wants an ord returned; the second that we are in
  # runtime, and this is the first level routine called from the user
  return _charnames::lookup_name($arg, 1, 1);
} # vianame

# string_vianame(NAME)
#
# Looks up the character name and returns its string representation if
# found, undef otherwise.
#
# Exactly one argument is required; otherwise a warning is issued via
# _charnames::carp and nothing is returned.
#
# A NAME of the form "U+<hex digits>" is converted directly to the
# corresponding character, subject to the same 'use bytes' restriction on
# code points above 255 as in vianame().
sub string_vianame {

  if (@_ != 1) {
    _charnames::carp "charnames::string_vianame() expects one name argument";
    return;
  }

  my $arg = shift;

  if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {

    my $ord = CORE::hex $1;

    # Same 'use bytes' check against the caller's hints as in vianame().
    return pack("U", $ord) if $ord <= 255 || ! ((caller 0)[8] & $bytes::hint_bits);

    _charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord);
    return;
  }

  # The 0 arg means wants a string returned; the 1 arg means that we are in
  # runtime, and this is the first level routine called from the user
  return _charnames::lookup_name($arg, 0, 1);
} # string_vianame

1;
__END__

=encoding utf8

=head1 NAME

charnames - access to Unicode character names and named character sequences; also define character names

=head1 SYNOPSIS

 use charnames ':full';
 print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
 print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}",
       " is an officially named sequence of two Unicode characters\n";

 use charnames ':loose';
 print "\N{Greek small-letter sigma}",
        "can be used to ignore case, underscores, most blanks,",
        "and when you aren't sure if the official name has hyphens\n";

 use charnames ':short';
 print "\N{greek:Sigma} is an upper-case sigma.\n";

 use charnames qw(cyrillic greek);
 print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";

 use utf8;
 use charnames ":full", ":alias" => {
   e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
   mychar => 0xE8000,  # Private use area
   "自転車に乗る人" => "BICYCLIST"
 };
 print "\N{e_ACUTE} is a small letter e with an acute.\n";
 print "\N{mychar} allows me to name private use characters.\n";
 print "And I can create synonyms in other languages,",
       " such as \N{自転車に乗る人} for BICYCLIST (U+1F6B4)\n";

 use charnames ();
 print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
 printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
                                                          # "10330"
 print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on
                                                     # ASCII platforms;
                                                     # 193 on EBCDIC
 print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A"

=head1 DESCRIPTION

Pragma C<use charnames> is used to gain access to the names of the
Unicode characters and named character sequences, and to allow you to define
your own character and character sequence names.

All forms of the pragma enable use of the following 3 functions:

=over

=item *

L</charnames::string_vianame(I<name>)> for run-time lookup of
either a character name or a named character sequence, returning its string
representation

=item *

L</charnames::vianame(I<name>)> for run-time lookup of a
character name (but not a named character sequence) to get its ordinal value
(code point)

=item *

L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its
Unicode name.

=back

Starting in Perl v5.16, any occurrence of C<\N{I<CHARNAME>}> sequences
in a double-quotish string automatically loads this module with arguments
C<:full> and C<:short> (described below) if it hasn't already been loaded with
different arguments, in order to compile the named Unicode character into
position in the string.  Prior to v5.16, an explicit S<C<use charnames>> was
required to enable this usage.  (However, prior to v5.16, the form C<S<"use
charnames ();">> did not enable C<\N{I<CHARNAME>}>.)

Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
also inserts a character into a string.
The character it inserts is the one whose Unicode code point
(ordinal value) is equal to the number.  For example, C<"\N{U+263a}"> is
the Unicode (white background, black foreground) smiley face
equivalent to C<"\N{WHITE SMILING FACE}">.
Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character
name, when the I<...> is a number (or comma separated pair of numbers;
see L<perlreref/QUANTIFIERS>), and is not related to this pragma.

The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>,
script names and L<customized aliases|/CUSTOM ALIASES>.

If C<:full> is present, for expansion of
C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
standard Unicode character names.

C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less
precisely specified.  Details are in L</LOOSE MATCHES>.

If C<:short> is present, and
I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
as a letter in script I<SCRIPT>