-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #630 from MShwed/feature/mime-rfc2047
Feature: MIME RFC2047 Decoding
- Loading branch information
Showing
4 changed files
with
263 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,7 +76,8 @@ | |
"Rison Encode", | ||
"Rison Decode", | ||
"To Modhex", | ||
"From Modhex" | ||
"From Modhex", | ||
"MIME Decoding" | ||
] | ||
}, | ||
{ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
/** | ||
* @author mshwed [[email protected]] | ||
* @copyright Crown Copyright 2019 | ||
* @license Apache-2.0 | ||
*/ | ||
|
||
import Operation from "../Operation.mjs"; | ||
import OperationError from "../errors/OperationError.mjs"; | ||
import Utils from "../Utils.mjs"; | ||
import { fromHex } from "../lib/Hex.mjs"; | ||
import { fromBase64 } from "../lib/Base64.mjs"; | ||
import cptable from "codepage"; | ||
|
||
/** | ||
* MIME Decoding operation | ||
*/ | ||
class MIMEDecoding extends Operation {

    /**
     * MIMEDecoding constructor
     */
    constructor() {
        super();

        this.name = "MIME Decoding";
        this.module = "Default";
        this.description = "Enables the decoding of MIME message header extensions for non-ASCII text";
        this.infoURL = "https://tools.ietf.org/html/rfc2047";
        this.inputType = "byteArray";
        this.outputType = "string";
        this.args = [];
    }

    /**
     * Decodes every RFC 2047 encoded word found in the input.
     *
     * @param {byteArray} input
     * @param {Object[]} args
     * @returns {string}
     */
    run(input, args) {
        const mimeEncodedText = Utils.byteArrayToUtf8(input);
        // Work on LF-only line endings so folded headers are handled uniformly.
        const encodedHeaders = mimeEncodedText.replace(/\r\n/g, "\n");

        return this.decodeHeaders(encodedHeaders);
    }

    /**
     * Decode MIME header strings.
     *
     * Scans the string for encoded words of the form "=?charset?encoding?text?="
     * and replaces each with its decoded text. Whitespace-only gaps between two
     * adjacent encoded words are dropped. Text that does not form a complete
     * encoded word is passed through unchanged.
     *
     * @param {string} headerString
     * @returns {string} The header text with encoded words decoded.
     * @throws {OperationError} If an encoded word uses an unhandled charset or
     *      contains an incorrectly Q-encoded payload.
     */
    decodeHeaders(headerString) {
        const firstWord = headerString.indexOf("=?");
        // No encoded words detected
        if (firstWord === -1) return headerString;

        let decodedHeaders = headerString.slice(0, firstWord);
        let header = headerString.slice(firstWord);

        // True when the previously consumed token was a decoded encoded word.
        let isBetweenWords = false;
        while (header.length > 0) {
            const start = header.indexOf("=?");
            if (start === -1) break;
            let cur = start + "=?".length;

            const charsetEnd = header.indexOf("?", cur);
            if (charsetEnd === -1) break;
            const charset = header.slice(cur, charsetEnd);
            cur = charsetEnd + "?".length;

            // There must be room left for at least "<encoding>??=".
            if (header.length < cur + "Q??=".length) break;

            const encoding = header[cur].toLowerCase();
            cur += 1;

            if (header[cur] !== "?") break;
            cur += 1;

            const textEnd = header.indexOf("?=", cur);
            if (textEnd === -1) break;

            const encodedText = header.slice(cur, textEnd);
            const end = textEnd + "?=".length;

            let text;
            if (encoding === "b") {
                text = fromBase64(encodedText);
            } else if (encoding === "q") {
                text = this.parseQEncodedWord(encodedText);
            } else {
                // Unknown encoding: this is not an encoded word. Emit everything
                // up to and including the "=?" marker verbatim and resume scanning
                // after it. The offsets computed above refer to the old string, so
                // the rest of this iteration must be skipped.
                isBetweenWords = false;
                decodedHeaders += header.slice(0, start + "=?".length);
                header = header.slice(start + "=?".length);
                continue;
            }

            // Keep the text preceding the encoded word unless it is pure
            // whitespace separating two encoded words.
            const leading = header.slice(0, start);
            if (start > 0 && (!isBetweenWords || /\S/.test(leading))) {
                decodedHeaders += leading;
            }

            decodedHeaders += this.convertFromCharset(charset, text);

            header = header.slice(end);
            isBetweenWords = true;
        }

        // Whatever is left contains no further complete encoded word.
        if (header.length > 0) {
            decodedHeaders += header;
        }

        return decodedHeaders;
    }

    /**
     * Converts decoded text for supported charsets.
     * Supports UTF-8, US-ASCII, ISO-8859-*
     *
     * @param {string} charset - Charset label from the encoded word (case-insensitive).
     * @param {string} encodedText - Transfer-decoded text, still in the given charset.
     * @returns {string}
     * @throws {OperationError} If the charset is not one of the supported ones.
     */
    convertFromCharset(charset, encodedText) {
        const label = charset.toLowerCase();

        if (label === "utf-8") {
            return cptable.utils.decode(65001, encodedText);
        }
        if (label === "us-ascii") {
            return cptable.utils.decode(20127, encodedText);
        }

        // Require a purely numeric part so labels such as "iso-8859-1x" are rejected.
        const isoMatch = /^iso-8859-(\d{1,2})$/.exec(label);
        if (isoMatch !== null) {
            const isoCharset = parseInt(isoMatch[1], 10);
            // ISO-8859-12 was never published, so it is reported as unhandled
            // rather than mapped to a code page number.
            if (isoCharset >= 1 && isoCharset <= 16 && isoCharset !== 12) {
                return cptable.utils.decode(28590 + isoCharset, encodedText);
            }
        }

        throw new OperationError("Unhandled Charset");
    }

    /**
     * Parses a Q encoded word
     *
     * "_" becomes a space, "=XX" becomes the character with hexadecimal code XX,
     * and printable ASCII, CR, LF and TAB are copied through unchanged.
     *
     * @param {string} encodedWord
     * @returns {string}
     * @throws {OperationError} On a truncated or non-hexadecimal "=XX" escape, or
     *      on any character outside the accepted set.
     */
    parseQEncodedWord(encodedWord) {
        const hexPair = /^[0-9a-f]{2}$/i;

        let decodedWord = "";
        for (let i = 0; i < encodedWord.length; i++) {
            const char = encodedWord[i];

            if (char === "_") {
                decodedWord += " ";
            } else if (char === "=") {
                // Parse hex encoding; exactly two hex digits must follow the "=".
                const hex = encodedWord.substring(i + 1, i + 3);
                if (!hexPair.test(hex)) throw new OperationError("Incorrectly Encoded Word");
                decodedWord += Utils.byteArrayToChars(fromHex(hex));
                i += 2;
            } else if ((char >= " " && char <= "~") || char === "\n" || char === "\r" || char === "\t") {
                decodedWord += char;
            } else {
                throw new OperationError("Incorrectly Encoded Word");
            }
        }

        return decodedWord;
    }
}
|
||
export default MIMEDecoding; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
/** | ||
* MIME Header Decoding tests | ||
* | ||
* @author mshwed [[email protected]] | ||
* @copyright Crown Copyright 2019 | ||
* @license Apache-2.0 | ||
*/ | ||
|
||
import TestRegister from "../../lib/TestRegister.mjs"; | ||
|
||
// Operation tests for "MIME Decoding": each case feeds `input` through the
// recipe and compares the result with `expectedOutput`.
TestRegister.addTests([
    {
        name: "Encoded comments",
        input: "(=?ISO-8859-1?Q?a?=)",
        expectedOutput: "(a)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "Encoded adjacent comments whitespace",
        input: "(=?ISO-8859-1?Q?a?= b)",
        expectedOutput: "(a b)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "Encoded adjacent single whitespace ignored",
        input: "(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)",
        expectedOutput: "(ab)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "Encoded adjacent double whitespace ignored",
        // Two spaces between the encoded words, distinguishing this case from
        // the single-whitespace one above.
        input: "(=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=)",
        expectedOutput: "(ab)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "Encoded adjacent CRLF whitespace ignored",
        input: "(=?ISO-8859-1?Q?a?=\r\n =?ISO-8859-1?Q?b?=)",
        expectedOutput: "(ab)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "UTF-8 Encodings Multiple Headers",
        input: "=?utf-8?q?=C3=89ric?= <[email protected]>, =?utf-8?q?Ana=C3=AFs?= <[email protected]>",
        expectedOutput: "Éric <[email protected]>, Anaïs <[email protected]>",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "ISO Decoding",
        input: "From: =?US-ASCII?Q?Keith_Moore?= <[email protected]>\nTo: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <[email protected]>\nCC: =?ISO-8859-1?Q?Andr=E9?= Pirard <[email protected]>\nSubject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\n=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=",
        expectedOutput: "From: Keith Moore <[email protected]>\nTo: Keld Jørn Simonsen <[email protected]>\nCC: André Pirard <[email protected]>\nSubject: If you can read this you understand the example.",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    }
]);