# -*-Perl-*-
################################################################
###
### Japanese.pm
###
### Author: Internet Message Group
### Created: Apr 23, 1997
### Revised: Jul 4, 2004
###
my $PM_VERSION = "IM::Japanese.pm version 20031028(IM146)";
package IM::Japanese;
require 5.003;
require Exporter;
use IM::Util;
use integer;
use strict;
use vars qw(@ISA @EXPORT);
@ISA = qw(Exporter);
@EXPORT = qw(code_check code_check_body
convert_iso2022jp_body conv_iso2022jp
conv_euc_from_jis conv_euc_from_sjis);
use vars qw($C_jis $C_jis_roman $C_sjis $C_sjis_kana
$C_euc $C_euc_kana $C_SorE $C_ascii
$C_pascii $C_tascii $C_sascii $C_8bit
$E_jp $E_asc $E_kana);
BEGIN {
$C_jis = '(\e\$[\@B])([\x21-\x7e]+)';
$C_jis_roman = '(\e\([BJ])([\s\x21-\x7e]*)';
$C_sjis = '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]';
$C_sjis_kana = '[\xa1-\xdf]';
$C_euc = '[\xa1-\xfe][\xa1-\xfe]';
$C_euc_kana = '\x8e[\xa1-\xdf]';
$C_SorE = '[\xa1-\xdf]|[\x8e\xe0-\xfc][\xa1-\xfc]';
$C_ascii = '[\s\x21-\x7e]';
# $C_ascii = '[\x07\s\x21-\x7e]'; # for IRC freak :-)
$C_pascii = '[\x21-\x7e]';
$C_tascii = '[\x21\x23-\x27\x2a\x2b\x2d\x30-\x39\x41-\x5a\x5e-\x7e]';
$C_sascii = '[\x22\x28\x29\x2c\x2e\x2f\x3a-\x40\x5b-\x5d]';
$C_8bit = '[\x80-\xff]';
($E_jp, $E_asc, $E_kana) = ("\e\$B", "\e(B", "\e(I");
}
##### CODE CHECKER #####
#
# code_check(line, use_hankaku_kana)
# line: a line of string to be checked
# use_hankaku_kana: bool value if check hankaku kana
# return value: encoding type
# ascii
# 8bit
# jis
# euc
# sjis
# sORe
#
sub code_check($;$) {
my($line, $no_hankaku_kana) = @_;
my($sjis, $euc);
if ($line =~ /^$C_ascii*$/o) {
return 'ascii';
} elsif ($line =~ /$C_jis/o) {
return 'jis';
}
if ($no_hankaku_kana) {
$euc = 1 if $line =~ /^($C_ascii|$C_euc)+$/o;
$sjis = 1 if $line =~ /^($C_ascii|$C_sjis)+$/o;
} else {
$euc = 1 if $line =~ /^($C_ascii|$C_euc|$C_euc_kana)+$/o;
$sjis = 1 if $line =~ /^($C_ascii|$C_sjis|$C_sjis_kana)+$/o;
}
if ($euc && $sjis) {
return 'sORe';
} elsif ($euc) {
return 'euc';
} elsif ($sjis) {
return 'sjis';
}
return '8bit';
}
##### BODY CODE CHECKER #####
#
# code_check_body(content)
# content: pointer to body content line list
# return value: encode type
# ASCII
# 8BIT
# JIS
# EUC
# SJIS
#
sub code_check_body($) {
my $content = shift;
my(%count) = ();
$count{'ascii'} = 0; # for debug print
$count{'8bit'} = 0;
$count{'jis'} = 0;
$count{'euc'} = 0;
$count{'sjis'} = 0;
$count{'sORe'} = 0;
$count{'has8bit'} = 0;
$count{'total'} = 0;
my $i;
for ($i = 0; $i <= $#$content; $i++) {
$count{code_check($$content[$i])}++;
my $line = $$content[$i];
$count{'total'} += length($line);
$line =~ s/[^\x80-\xff]//g;
$count{'has8bit'} += length($line);
}
# select encoding
if ($count{'has8bit'} * 8 > $count{'total'}) {
$main::Need_base64_encoded = 1;
} else {
$main::Need_base64_encoded = 0;
}
if (&debug('code')) {
im_debug("ascii = $count{'ascii'}\n");
im_debug("8bit = $count{'8bit'}\n");
im_debug("jis = $count{'jis'}\n");
im_debug("euc = $count{'euc'}\n");
im_debug("sjis = $count{'sjis'}\n");
im_debug("sORe = $count{'sORe'}\n");
}
return '8BIT' if ($count{'8bit'});
if ($count{'jis'}) {
return '8BIT'
if ($count{'sORe'} || $count{'sjis'} || $count{'euc'});
return 'JIS';
}
if ($count{'sjis'}) {
return '8BIT' if ($count{'euc'});
return 'SJIS';
}
return 'EUC' if ($count{'euc'});
return uc($main::Default_code) if ($count{'sORe'});
return 'ASCII';
}
##### CONVERT BODY INTO ISO-2022-JP ENCODING #####
#
# convert_iso2022jp_body(content, code)
# content: pointer to body content line list
# code: input kanji code
# return value: none
#
sub convert_iso2022jp_body($$) {
my($content, $code) = @_;
my $i;
for ($i = 0; $i <= $#$content; $i++) {
$$content[$i] = conv_iso2022jp($$content[$i], $code);
}
}
##### ISO-2022-JP CODE CONVERSION #####
#
# conv_iso2022jp(line, code)
# line: a line of string to be converted
# code: input kanji code
# return value: converted line
#
sub conv_iso2022jp($;$) {
my($line, $code) = @_;
im_debug("conv_iso2022jp: $line\n") if (&debug('japanese'));
unless ($line =~ /[\x80-\xff]/) {
im_debug("source is ascii or jis\n") if (&debug('japanese'));
return $line;
}
if ($code eq 'NoHankana') {
$code = uc(code_check($line, 1));
} elsif (!defined($code)) {
$code = uc(code_check($line));
}
$code = uc($main::Default_code) if ($code eq 'SORE');
if ($code eq 'EUC') {
im_debug("source is euc\n") if (&debug('japanese'));
return &conv_from_euc($line);
} elsif ($code eq 'SJIS') {
im_debug("source is sjis\n") if (&debug('japanese'));
return &conv_from_sjis($line);
}
im_debug("source is unknown, nothing done\n") if (&debug('japanese'));
return $line;
}
##### ISO-2022-JP CODE CONVERSION FROM SJIS #####
#
# conv_from_sjis(line)
# line: a line of string to be converted
# return value: converted line
#
sub conv_from_sjis($) {
my $line = shift;
$line =~ s/((?:$C_sjis|$C_sjis_kana)+)/sjis2jis($1)/geo;
return $line;
}
sub sjis2jis($) {
my $line = shift;
$line =~ s/((?:$C_sjis)+|(?:$C_sjis_kana)+)/s2j($1)/geo;
return "$line$E_asc";
}
sub s2e($) {
my $code = shift;
my($c1, $c2) = unpack('CC', $code);
if (0xa1 <= $c1 && $c1 <= 0xdf) {
$c2 = $c1;
$c1 = 0x8e;
} elsif ($c2 >= 0x9f) {
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60);
$c2 += 2;
} else {
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61);
$c2 += 0x60 + ($c2 < 0x7f);
}
return pack('CC', $c1, $c2);
}
sub s2j($) {
my $cur = shift;
if ($cur =~ /^$C_sjis_kana/o) {
$cur =~ tr/\xa1-\xdf/\x21-\x5f/;
return "$E_kana$cur";
} else {
$cur =~ s/(..)/s2e($1)/ge;
$cur =~ tr/\xa1-\xfe/\x21-\x7e/;
return "$E_jp$cur";
}
}
##### ISO-2022-JP CODE CONVERSION FROM EUC #####
#
# conv_from_euc(line)
# line: a line of string to be converted
# return value: converted line
#
sub conv_from_euc($) {
my $line = shift;
$line =~ s/((?:$C_euc|$C_euc_kana)+)/euc2jis($1)/geo;
return $line;
}
sub euc2jis($) {
my $line = shift;
$line =~ s/((?:$C_euc)+|(?:$C_euc_kana)+)/e2j($1)/geo;
return "$line$E_asc";
}
sub e2j($) {
my $cur = shift;
$cur =~ tr/\xa1-\xfe/\x21-\x7e/;
if ($cur =~ tr/\x8e//d) {
return "$E_kana$cur";
} else {
return "$E_jp$cur";
}
}
##### EUC CODE CONVERSION FROM SJIS #####
#
# conv_euc_from_sjis(line)
# line: a line of string to be converted
# return value: converted line
#
sub conv_euc_from_sjis($) {
my $line = shift;
$line =~ s/($C_sjis|$C_sjis_kana)/s2e($1)/geo;
return $line;
}
##### EUC CODE CONVERSION FROM JIS #####
#
# conv_euc_from_jis(line)
# line: a line of string to be converted
# return value: converted line
#
sub conv_euc_from_jis($) {
my $line = shift;
$line =~ s/$C_jis/j2e($1,$2)/geo;
$line =~ s/\e\$C_jis_roman/$2/geo;
return $line;
}
sub j2e($$) {
my $esc = shift;
my $line = shift;
if ($esc =~ /\e\$[\@B]/) {
$line =~ tr/\x21-\x7e/\xa1-\xfe/;
}
return $line;
}
1;
__END__
=head1 NAME
IM::Japanese - Japanese message handler
=head1 SYNOPSIS
use IM::Japanese;
$code = code_check($line, $use_hankaku_kana);
$code = code_check_body($content);
convert_iso2022jp_body($content, $code);
$converted = conv_iso2022jp($line, $code);
=head1 DESCRIPTION
The I module handles Japanese message encoded with
ISO-2022-JP, EUC-JP, Shift_JIS, or US-ASCII.
This modules is provided by IM (Internet Message).
=head1 COPYRIGHT
IM (Internet Message) is copyrighted by IM developing team.
You can redistribute it and/or modify it under the modified BSD
license. See the copyright file for more details.
=cut
### Copyright (C) 1997, 1998, 1999 IM developing team
### All rights reserved.
###
### Redistribution and use in source and binary forms, with or without
### modification, are permitted provided that the following conditions
### are met:
###
### 1. Redistributions of source code must retain the above copyright
### notice, this list of conditions and the following disclaimer.
### 2. Redistributions in binary form must reproduce the above copyright
### notice, this list of conditions and the following disclaimer in the
### documentation and/or other materials provided with the distribution.
### 3. Neither the name of the team nor the names of its contributors
### may be used to endorse or promote products derived from this software
### without specific prior written permission.
###
### THIS SOFTWARE IS PROVIDED BY THE TEAM AND CONTRIBUTORS ``AS IS'' AND
### ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
### IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
### PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE TEAM OR CONTRIBUTORS BE
### LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
### CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
### SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
### BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
### WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
### OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
### IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.