Skip to content

Commit d3a8b46

Browse files
committed
Looks like this encoding works much better
1 parent 8a1f258 commit d3a8b46

File tree

5 files changed

+78
-42
lines changed

5 files changed

+78
-42
lines changed

Changes.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# 1.8.3 - 2022-04-22
1+
# 1.8.3 - 2022-05-07
22

33
- Updated email parser body
44
- Fix for Redis/KeyDB crash when plugin is started as delivery only

email_body_utility.js

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
const EmailBodyUtility = function() {
22
const stream = require('stream');
33

4+
// const ced = require('ced');
45
const async = require('async');
5-
// const IsBase64 = require('is-base64');
66
const linkify = require('linkify-it')();
7-
const ced = require('ced');
87
const Splitter = require('mailsplit').Splitter;
9-
// const detectCharacterEncoding = require('detect-character-encoding');
8+
const detectCharacterEncoding = require('detect-character-encoding');
109

1110
const quotedPrintable = require('quoted-printable');
1211

13-
const _default_html_field_order = 'bodytext_html mailparser_html mailparser_text_as_html'.split(' ');
12+
const _default_html_field_order = 'mailparser_html bodytext_html mailparser_text_as_html'.split(' ');
1413
const _default_text_field_order = 'bodytext_plain mailparser_text'.split(' ');
1514

1615
const _haraka_bodytext_variations = 'haraka_bodytext haraka_body_text_encoded'.split(' ');
@@ -20,7 +19,7 @@ const EmailBodyUtility = function() {
2019

2120
const _iso_8859_charset_regex = /text\/html; charset=iso-8859-\d/img;
2221
const _windows_charset_regex = /text\/html;\s*charset=Windows-125(2|7)/img;
23-
const _is_base64_encoded_regex = /^(?:[A-Za-z\d+/]{4})*(?:[A-Za-z\d+/]{3}=|[A-Za-z\d+/]{2}==)?$/mg;
22+
const _is_base64_encoded_regex = /w\+?\+\s*$/;
2423

2524
const _uses_windows_1257_charset = /charset=Windows-1257/im;
2625
const _contains_html_invalid_unicode = /\x82/;
@@ -69,13 +68,13 @@ const EmailBodyUtility = function() {
6968
var text_field_order = prefer_mailparser ? 'mailparser_text bodytext_plain'.split(' ') : _default_text_field_order;
7069

7170
_log_module && console.log(`\ngetHtmlAndTextBody(), extracting 'html'...`);
72-
var html_info = !options.ignore_html_result ? _extractBody(email_obj, body, html_field_order, options) : { result: '' };
73-
_log_module && !has_rfc_822_message && console.log(`\ngetHtmlAndTextBody(), html result came from '${html_info.source}' and has a length of '${html_info.result.length}'`);
71+
var html_info = ! options.ignore_html_result ? _extractBody(email_obj, body, html_field_order, options) : { result: '' };
72+
_log_module && ! has_rfc_822_message && console.log(`\ngetHtmlAndTextBody(), html result came from '${html_info.source}' and has a length of '${html_info.result.length}'`);
7473

7574
_log_module && console.log(`\ngetHtmlAndTextBody(), extracting 'text'...`);
76-
var text_info = !options.ignore_text_result ? _extractBody(email_obj, body, text_field_order, options) : { result: '' };
75+
var text_info = ! options.ignore_text_result ? _extractBody(email_obj, body, text_field_order, options) : { result: '' };
7776

78-
_log_module && !has_rfc_822_message && console.log(`\ngetHtmlAndTextBody(), text result came from '${text_info.source}' and has a length of '${text_info.result.length}'`);
77+
_log_module && ! has_rfc_822_message && console.log(`\ngetHtmlAndTextBody(), text result came from '${text_info.source}' and has a length of '${text_info.result.length}'`);
7978
return waterfall_callback(null, html_info, text_info);
8079
},
8180
/* extract and append rfc822 info if present
@@ -98,14 +97,17 @@ const EmailBodyUtility = function() {
9897
return waterfall_callback(null, html_info, text_info);
9998
});
10099
},
101-
/* analyse results and overwrite html if text is better parsed */
100+
/* analyze results and overwrite html if text is better parsed */
102101
function(html_info, text_info, waterfall_callback) {
103102

104-
var use_text_for_html = !html_info.result // if we have no html result
103+
var use_text_for_html = ! html_info.result // if we have no html result
105104
||
106-
(text_info.result && html_info.source.includes('mailparser') && !text_info.source.includes('mailparser')) // if we have a text result, and the html result was from mailparser
105+
(text_info.result && html_info.source.includes('mailparser') && ! text_info.source.includes('mailparser') // if we have a text result, and the html result was from mailparser
107106
||
108-
(!html_info.has_valid_encoding && text_info.has_valid_encoding); // or we could not properly decode the content for the html but we could for the text
107+
(! html_info.has_valid_encoding && text_info.has_valid_encoding) // or we could not properly decode the content for the html but we could for the text
108+
);
109+
110+
var use_text_for_html = ! html_info.result;
109111

110112
// override any html mailparser result we have if there's a valid text result
111113
if (use_text_for_html) {
@@ -223,7 +225,11 @@ const EmailBodyUtility = function() {
223225
var field = field_order[i++];
224226
var result = getBodyByField(email_obj, body, field);
225227

226-
_log_module && console.log(`checking field '${field.toUpperCase()}', string: ${(result.body || '').substring(0,50)}...\n\n`);
228+
_log_module && console.log(`checking field '${field.toUpperCase()}', string: "${(result.body || '').substring(0,150)}..."\n`);
229+
// if result is unicode, then set the result.body to null
230+
var is_base64_encoded = _is_base64_encoded_regex.test(result.body);
231+
_log_module && console.log('is_base64_encoded:', is_base64_encoded)
232+
if (is_base64_encoded) { result.body = null; }
227233

228234
var is_base64_encoded = false;
229235
if (result.body && typeof result.body === 'string' && result.body.length) {
@@ -245,7 +251,7 @@ const EmailBodyUtility = function() {
245251
result.body = null;
246252
}
247253

248-
! is_base64_encoded && _log_module && console.log(`\n\nbase64 NOT FOUND for field'${field.toUpperCase()}', string: ${string_body.substring(0,50)}...\n\n`);
254+
! is_base64_encoded && _log_module && console.log(`\n\nbase64 NOT FOUND for field '${field.toUpperCase()}', string: ${string_body.substring(0,50)}...\n\n`);
249255
}
250256
}
251257

@@ -326,7 +332,7 @@ const EmailBodyUtility = function() {
326332
};
327333

328334
default:
329-
console.log(`unknown field type requested for body field: '${field}'`);
335+
_log_module && console.log(`unknown field type requested for body field: '${field}'`);
330336
return {
331337
'body': '',
332338
'source': 'none'
@@ -343,6 +349,7 @@ const EmailBodyUtility = function() {
343349
var is_matching_node = is_requested_type && (haraka_obj.bodytext || haraka_obj.body_text_encoded)
344350
_log_module && !is_matching_node && console.log(`${'\t'.repeat(depth)} [${index}] not a matching node for type '${type}'`);
345351

352+
346353
if (is_matching_node) {
347354
_log_module && console.log(`${'\t'.repeat(depth)} [${index}] found a matching bodytype of length '${haraka_obj.bodytext.length || haraka_obj.body_text_encoded.length}' for type '${type}'`);
348355

@@ -361,8 +368,13 @@ const EmailBodyUtility = function() {
361368
var bodytext_encoding = {};
362369
if (_body_text) {
363370
try {
364-
bodytext_encoding = ced(Buffer.from(_body_text));
365-
// bodytext_encoding = detectCharacterEncoding(Buffer.from(_body_text));
371+
bodytext_encoding = detectCharacterEncoding(Buffer.from(_body_text));
372+
// _log_module && console.log('!'.repeat(100))
373+
// _log_module && console.log(bodytext_encoding)
374+
// bodytext_encoding = ced(Buffer.from(_body_text));
375+
_log_module && console.log('!'.repeat(100))
376+
_log_module && console.log(bodytext_encoding)
377+
_log_module && console.log('!'.repeat(100))
366378
} catch(e) {}
367379
}
368380
// var bodytext_encoding = _body_text ? detectCharacterEncoding(Buffer.from(_body_text)) : {};
@@ -375,8 +387,7 @@ const EmailBodyUtility = function() {
375387
var body_text_encoded_encoding = {};
376388
if (_body_text_encoded) {
377389
try {
378-
body_text_encoded_encoding = ced(Buffer.from(_body_text_encoded));
379-
// body_text_encoded_encoding = detectCharacterEncoding(Buffer.from(_body_text_encoded));
390+
body_text_encoded_encoding = detectCharacterEncoding(Buffer.from(_body_text_encoded));
380391
} catch(e) {}
381392
}
382393
// var body_text_encoded_encoding = _body_text_encoded ? detectCharacterEncoding(Buffer.from(_body_text_encoded)) : {};
@@ -702,4 +713,4 @@ const EmailBodyUtility = function() {
702713
convertPlainTextToHtml // (text)
703714
};
704715
}();
705-
module.exports = EmailBodyUtility;
716+
module.exports = EmailBodyUtility;

index.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,8 +283,8 @@ exports.queue_to_mongodb = function(next, connection) {
283283
email.extracted_html_from = body_info.meta.html_source;
284284
email.extracted_text_from = body_info.meta.text_source;
285285
// Add html into email
286-
email.html = _body_html || body_info.html;
287-
email.text = _body_text || body_info.text;
286+
email.html = body_info.html;
287+
email.text = body_info.text;
288288
// Check for inline images
289289
_checkInlineImages(plugin, email, function(error, email) {
290290
// Return

package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"async": "^3.2.3",
3434
"ced": "^2.0.0",
3535
"decode-html": "^2.0.0",
36+
"detect-character-encoding": "^0.8.0",
3637
"fs-extra": "^10.1.0",
3738
"iconv": "^3.0.1",
3839
"ioredis": "^5.0.4",
@@ -41,7 +42,7 @@
4142
"mailsplit": "^5.3.2",
4243
"mime": "^3.0.0",
4344
"moment": "^2.29.3",
44-
"mongodb": "4",
45+
"mongodb": "^4.5.0",
4546
"node-gyp": "^9.0.0",
4647
"nodemailer": "^6.7.5",
4748
"quoted-printable": "^1.0.1",
@@ -51,7 +52,7 @@
5152
"watch": "^1.0.2"
5253
},
5354
"devDependencies": {
54-
"eslint": "8",
55+
"eslint": "^8.15.0",
5556
"eslint-plugin-haraka": "^1.0.14",
5657
"haraka-test-fixtures": "^1.0.33",
5758
"mocha": "*"

pnpm-lock.yaml

Lines changed: 40 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)