##// END OF EJS Templates
do not replace all invalid utf8 (#24616)...
Toshi MARUYAMA -
r15891:b2c0ea2c3e15
parent child
Show More
@@ -1,68 +1,68
1
1
2 module Redmine
2 module Redmine
3 module CodesetUtil
3 module CodesetUtil
4
4
5 def self.replace_invalid_utf8(str)
5 def self.replace_invalid_utf8(str)
6 return str if str.nil?
6 return str if str.nil?
7 str.force_encoding('UTF-8')
7 str.force_encoding('UTF-8')
8 if ! str.valid_encoding?
8 if ! str.valid_encoding?
9 str = str.encode("US-ASCII", :invalid => :replace,
9 str = str.encode("UTF-16LE", :invalid => :replace,
10 :undef => :replace, :replace => '?').encode("UTF-8")
10 :undef => :replace, :replace => '?').encode("UTF-8")
11 end
11 end
12 str
12 str
13 end
13 end
14
14
15 def self.to_utf8(str, encoding)
15 def self.to_utf8(str, encoding)
16 return str if str.nil?
16 return str if str.nil?
17 str.force_encoding("ASCII-8BIT")
17 str.force_encoding("ASCII-8BIT")
18 if str.empty?
18 if str.empty?
19 str.force_encoding("UTF-8")
19 str.force_encoding("UTF-8")
20 return str
20 return str
21 end
21 end
22 enc = encoding.blank? ? "UTF-8" : encoding
22 enc = encoding.blank? ? "UTF-8" : encoding
23 if enc.upcase != "UTF-8"
23 if enc.upcase != "UTF-8"
24 str.force_encoding(enc)
24 str.force_encoding(enc)
25 str = str.encode("UTF-8", :invalid => :replace,
25 str = str.encode("UTF-8", :invalid => :replace,
26 :undef => :replace, :replace => '?')
26 :undef => :replace, :replace => '?')
27 else
27 else
28 str = replace_invalid_utf8(str)
28 str = replace_invalid_utf8(str)
29 end
29 end
30 str
30 str
31 end
31 end
32
32
33 def self.to_utf8_by_setting(str)
33 def self.to_utf8_by_setting(str)
34 return str if str.nil?
34 return str if str.nil?
35 self.to_utf8_by_setting_internal(str).force_encoding('UTF-8')
35 self.to_utf8_by_setting_internal(str).force_encoding('UTF-8')
36 end
36 end
37
37
38 def self.to_utf8_by_setting_internal(str)
38 def self.to_utf8_by_setting_internal(str)
39 return str if str.nil?
39 return str if str.nil?
40 str.force_encoding('ASCII-8BIT')
40 str.force_encoding('ASCII-8BIT')
41 return str if str.empty?
41 return str if str.empty?
42 return str if /\A[\r\n\t\x20-\x7e]*\Z/n.match(str) # for us-ascii
42 return str if /\A[\r\n\t\x20-\x7e]*\Z/n.match(str) # for us-ascii
43 str.force_encoding('UTF-8')
43 str.force_encoding('UTF-8')
44 encodings = Setting.repositories_encodings.split(',').collect(&:strip)
44 encodings = Setting.repositories_encodings.split(',').collect(&:strip)
45 encodings.each do |encoding|
45 encodings.each do |encoding|
46 begin
46 begin
47 str.force_encoding(encoding)
47 str.force_encoding(encoding)
48 utf8 = str.encode('UTF-8')
48 utf8 = str.encode('UTF-8')
49 return utf8 if utf8.valid_encoding?
49 return utf8 if utf8.valid_encoding?
50 rescue
50 rescue
51 # do nothing here and try the next encoding
51 # do nothing here and try the next encoding
52 end
52 end
53 end
53 end
54 self.replace_invalid_utf8(str).force_encoding('UTF-8')
54 self.replace_invalid_utf8(str).force_encoding('UTF-8')
55 end
55 end
56
56
57 def self.from_utf8(str, encoding)
57 def self.from_utf8(str, encoding)
58 str ||= ''
58 str ||= ''
59 str.force_encoding('UTF-8')
59 str.force_encoding('UTF-8')
60 if encoding.upcase != 'UTF-8'
60 if encoding.upcase != 'UTF-8'
61 str = str.encode(encoding, :invalid => :replace,
61 str = str.encode(encoding, :invalid => :replace,
62 :undef => :replace, :replace => '?')
62 :undef => :replace, :replace => '?')
63 else
63 else
64 str = self.replace_invalid_utf8(str)
64 str = self.replace_invalid_utf8(str)
65 end
65 end
66 end
66 end
67 end
67 end
68 end
68 end
@@ -1,104 +1,104
1 # Redmine - project management software
1 # Redmine - project management software
2 # Copyright (C) 2006-2016 Jean-Philippe Lang
2 # Copyright (C) 2006-2016 Jean-Philippe Lang
3 #
3 #
4 # This program is free software; you can redistribute it and/or
4 # This program is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU General Public License
5 # modify it under the terms of the GNU General Public License
6 # as published by the Free Software Foundation; either version 2
6 # as published by the Free Software Foundation; either version 2
7 # of the License, or (at your option) any later version.
7 # of the License, or (at your option) any later version.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU General Public License
14 # You should have received a copy of the GNU General Public License
15 # along with this program; if not, write to the Free Software
15 # along with this program; if not, write to the Free Software
16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17
17
18 require File.expand_path('../../../../test_helper', __FILE__)
18 require File.expand_path('../../../../test_helper', __FILE__)
19
19
20 class Redmine::CodesetUtilTest < ActiveSupport::TestCase
20 class Redmine::CodesetUtilTest < ActiveSupport::TestCase
21
21
22 def test_to_utf8_by_setting_from_latin1
22 def test_to_utf8_by_setting_from_latin1
23 with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do
23 with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do
24 s1 = "Texte encod\xc3\xa9".force_encoding("UTF-8")
24 s1 = "Texte encod\xc3\xa9".force_encoding("UTF-8")
25 s2 = "Texte encod\xe9".force_encoding("ASCII-8BIT")
25 s2 = "Texte encod\xe9".force_encoding("ASCII-8BIT")
26 s3 = s2.dup.force_encoding("UTF-8")
26 s3 = s2.dup.force_encoding("UTF-8")
27 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s2)
27 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s2)
28 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s3)
28 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s3)
29 end
29 end
30 end
30 end
31
31
32 def test_to_utf8_by_setting_from_euc_jp
32 def test_to_utf8_by_setting_from_euc_jp
33 with_settings :repositories_encodings => 'UTF-8,EUC-JP' do
33 with_settings :repositories_encodings => 'UTF-8,EUC-JP' do
34 s1 = "\xe3\x83\xac\xe3\x83\x83\xe3\x83\x89\xe3\x83\x9e\xe3\x82\xa4\xe3\x83\xb3".force_encoding("UTF-8")
34 s1 = "\xe3\x83\xac\xe3\x83\x83\xe3\x83\x89\xe3\x83\x9e\xe3\x82\xa4\xe3\x83\xb3".force_encoding("UTF-8")
35 s2 = "\xa5\xec\xa5\xc3\xa5\xc9\xa5\xde\xa5\xa4\xa5\xf3".force_encoding("ASCII-8BIT")
35 s2 = "\xa5\xec\xa5\xc3\xa5\xc9\xa5\xde\xa5\xa4\xa5\xf3".force_encoding("ASCII-8BIT")
36 s3 = s2.dup.force_encoding("UTF-8")
36 s3 = s2.dup.force_encoding("UTF-8")
37 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s2)
37 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s2)
38 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s3)
38 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s3)
39 end
39 end
40 end
40 end
41
41
42 def test_to_utf8_by_setting_should_be_converted_all_latin1
42 def test_to_utf8_by_setting_should_be_converted_all_latin1
43 with_settings :repositories_encodings => 'ISO-8859-1' do
43 with_settings :repositories_encodings => 'ISO-8859-1' do
44 s1 = "\xc3\x82\xc2\x80".force_encoding("UTF-8")
44 s1 = "\xc3\x82\xc2\x80".force_encoding("UTF-8")
45 s2 = "\xC2\x80".force_encoding("ASCII-8BIT")
45 s2 = "\xC2\x80".force_encoding("ASCII-8BIT")
46 s3 = s2.dup.force_encoding("UTF-8")
46 s3 = s2.dup.force_encoding("UTF-8")
47 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s2)
47 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s2)
48 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s3)
48 assert_equal s1, Redmine::CodesetUtil.to_utf8_by_setting(s3)
49 end
49 end
50 end
50 end
51
51
52 def test_to_utf8_by_setting_blank_string
52 def test_to_utf8_by_setting_blank_string
53 assert_equal "", Redmine::CodesetUtil.to_utf8_by_setting("")
53 assert_equal "", Redmine::CodesetUtil.to_utf8_by_setting("")
54 assert_nil Redmine::CodesetUtil.to_utf8_by_setting(nil)
54 assert_nil Redmine::CodesetUtil.to_utf8_by_setting(nil)
55 end
55 end
56
56
57 def test_to_utf8_by_setting_returns_ascii_as_utf8
57 def test_to_utf8_by_setting_returns_ascii_as_utf8
58 s1 = "ASCII".force_encoding("UTF-8")
58 s1 = "ASCII".force_encoding("UTF-8")
59 s2 = s1.dup.force_encoding("ISO-8859-1")
59 s2 = s1.dup.force_encoding("ISO-8859-1")
60 str1 = Redmine::CodesetUtil.to_utf8_by_setting(s1)
60 str1 = Redmine::CodesetUtil.to_utf8_by_setting(s1)
61 str2 = Redmine::CodesetUtil.to_utf8_by_setting(s2)
61 str2 = Redmine::CodesetUtil.to_utf8_by_setting(s2)
62 assert_equal s1, str1
62 assert_equal s1, str1
63 assert_equal s1, str2
63 assert_equal s1, str2
64 assert_equal "UTF-8", str1.encoding.to_s
64 assert_equal "UTF-8", str1.encoding.to_s
65 assert_equal "UTF-8", str2.encoding.to_s
65 assert_equal "UTF-8", str2.encoding.to_s
66 end
66 end
67
67
68 def test_to_utf8_by_setting_invalid_utf8_sequences_should_be_stripped
68 def test_to_utf8_by_setting_invalid_utf8_sequences_should_be_stripped
69 with_settings :repositories_encodings => '' do
69 with_settings :repositories_encodings => '' do
70 # s1 = File.read("#{RAILS_ROOT}/test/fixtures/encoding/iso-8859-1.txt")
70 # s1 = File.read("#{RAILS_ROOT}/test/fixtures/encoding/iso-8859-1.txt")
71 s1 = "Texte encod\xe9 en ISO-8859-1.".force_encoding("ASCII-8BIT")
71 s1 = "Texte encod\xe9 en ISO-8859-1.".force_encoding("ASCII-8BIT")
72 str = Redmine::CodesetUtil.to_utf8_by_setting(s1)
72 str = Redmine::CodesetUtil.to_utf8_by_setting(s1)
73 assert str.valid_encoding?
73 assert str.valid_encoding?
74 assert_equal "UTF-8", str.encoding.to_s
74 assert_equal "UTF-8", str.encoding.to_s
75 assert_equal "Texte encod? en ISO-8859-1.", str
75 assert_equal "Texte encod? en ISO-8859-1.", str
76 end
76 end
77 end
77 end
78
78
79 def test_to_utf8_by_setting_invalid_utf8_sequences_should_be_stripped_ja_jis
79 def test_to_utf8_by_setting_invalid_utf8_sequences_should_be_stripped_ja_jis
80 with_settings :repositories_encodings => 'ISO-2022-JP' do
80 with_settings :repositories_encodings => 'ISO-2022-JP' do
81 s1 = "test\xb5\xfetest\xb5\xfe".force_encoding("ASCII-8BIT")
81 s1 = "test\xb5\xfetest\xb5\xfe".force_encoding("ASCII-8BIT")
82 str = Redmine::CodesetUtil.to_utf8_by_setting(s1)
82 str = Redmine::CodesetUtil.to_utf8_by_setting(s1)
83 assert str.valid_encoding?
83 assert str.valid_encoding?
84 assert_equal "UTF-8", str.encoding.to_s
84 assert_equal "UTF-8", str.encoding.to_s
85 assert_equal "test??test??", str
85 assert_equal "test??test??", str
86 end
86 end
87 end
87 end
88
88
89 test "#replace_invalid_utf8 should replace invalid utf8" do
89 test "#replace_invalid_utf8 should replace invalid utf8" do
90 s1 = "\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xE3\x81\xFF".force_encoding("UTF-8")
90 s1 = "\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xE3\x81\xFF".force_encoding("UTF-8")
91 s2 = Redmine::CodesetUtil.replace_invalid_utf8(s1)
91 s2 = Redmine::CodesetUtil.replace_invalid_utf8(s1)
92 assert s2.valid_encoding?
92 assert s2.valid_encoding?
93 assert_equal "UTF-8", s2.encoding.to_s
93 assert_equal "UTF-8", s2.encoding.to_s
94 assert_equal "??????", s2
94 assert_equal "\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1??".force_encoding("UTF-8"), s2
95 end
95 end
96
96
97 test "#to_utf8 should replace invalid non utf8" do
97 test "#to_utf8 should replace invalid non utf8" do
98 s1 = "\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4".force_encoding("EUC-JP")
98 s1 = "\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4".force_encoding("EUC-JP")
99 s2 = Redmine::CodesetUtil.to_utf8(s1, "EUC-JP")
99 s2 = Redmine::CodesetUtil.to_utf8(s1, "EUC-JP")
100 assert s2.valid_encoding?
100 assert s2.valid_encoding?
101 assert_equal "UTF-8", s2.encoding.to_s
101 assert_equal "UTF-8", s2.encoding.to_s
102 assert_equal "\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1?".force_encoding("UTF-8"), s2
102 assert_equal "\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1?".force_encoding("UTF-8"), s2
103 end
103 end
104 end
104 end
General Comments 0
You need to be logged in to leave comments. Login now