From 3eeaeefc677bbdfc0002b23f40c2c1f823a8a778 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 15 Jul 2022 12:36:38 +0900 Subject: [PATCH] Add support for transcoding String CSV GitHub: fix ruby/csv#254 Syntax is "from-encoding:to-encoding". Reported by Richard Stueven. Thanks!!! --- lib/csv.rb | 13 ++++++++++++- test/csv/test_encodings.rb | 31 +++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/lib/csv.rb b/lib/csv.rb index a11cceb4..1635bda0 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -1889,8 +1889,19 @@ def initialize(data, raise ArgumentError.new("Cannot parse nil as CSV") if data.nil? if data.is_a?(String) + if encoding + if encoding.is_a?(String) + data_external_encoding, data_internal_encoding = encoding.split(":", 2) + if data_internal_encoding + data = data.encode(data_internal_encoding, data_external_encoding) + else + data = data.dup.force_encoding(data_external_encoding) + end + else + data = data.dup.force_encoding(encoding) + end + end @io = StringIO.new(data) - @io.set_encoding(encoding || data.encoding) else @io = data end diff --git a/test/csv/test_encodings.rb b/test/csv/test_encodings.rb index 8d228c05..f08d551f 100644 --- a/test/csv/test_encodings.rb +++ b/test/csv/test_encodings.rb @@ -288,6 +288,37 @@ def test_invalid_encoding_row_error error.message) end + def test_string_input_transcode + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + # U+3046 HIRAGANA LETTER U + value = "\u3042\u3044\u3046" + csv = CSV.new(value, encoding: "UTF-8:EUC-JP") + assert_equal([[value.encode("EUC-JP")]], + csv.read) + end + + def test_string_input_set_encoding_string + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + # U+3046 HIRAGANA LETTER U + value = "\u3042\u3044\u3046".encode("EUC-JP") + csv = CSV.new(value.dup.force_encoding("UTF-8"), encoding: "EUC-JP") + assert_equal([[value.encode("EUC-JP")]], + csv.read) + end + + def test_string_input_set_encoding_encoding + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + # U+3046 HIRAGANA LETTER U + value = "\u3042\u3044\u3046".encode("EUC-JP") + csv = CSV.new(value.dup.force_encoding("UTF-8"), + encoding: Encoding.find("EUC-JP")) + assert_equal([[value.encode("EUC-JP")]], + csv.read) + end + private def assert_parses(fields, encoding, **options)