-
Notifications
You must be signed in to change notification settings - Fork 272
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Choose Q- or B-Encoding depending on percentage of printable characters * Encode all characters not listed in RFC2047 Section 5 (3) in Q-Encoding * Encode Spaces as _ in Q-Encoding * Fix header decoding for trailing _ in Q-Encoded text Co-Authored-By: Jan Uhlig <[email protected]>
- Loading branch information
1 parent
fe4f164
commit a7e7d96
Showing
2 changed files
with
186 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -227,7 +227,7 @@ tokenize_header(Value, Acc) -> | |
case Type of | ||
<<"q">> -> | ||
%% RFC 2047 #5. (3) | ||
decode_quoted_printable(re:replace(Data, "_", " ", [{return, binary}, global])); | ||
decode_quoted_printable(re:replace(Data, "_", "=20", [{return, binary}, global])); | ||
<<"b">> -> | ||
decode_base64(re:replace(Data, "_", " ", [{return, binary}, global])) | ||
end, | ||
|
@@ -1012,67 +1012,151 @@ fix_encoding(Encoding) -> | |
|
||
%% @doc RFC 2047 encode a header value given as a binary or a list.
%% The input is treated as UTF-8 encoded bytes, not as codepoints.
%% A binary is first converted to a list of bytes. `undefined' and the
%% empty list are passed through, and a list made up solely of printable
%% ASCII is returned as-is; everything else is Q-encoded.
rfc2047_utf8_encode(undefined) ->
    undefined;
rfc2047_utf8_encode(Bin) when is_binary(Bin) ->
    rfc2047_utf8_encode(binary_to_list(Bin));
rfc2047_utf8_encode([]) ->
    [];
rfc2047_utf8_encode(Chars) ->
    case is_ascii_printable(Chars) of
        'false' ->
            %% The worker is seeded with the reversed word prefix and
            %% the prefix length (10 characters).
            WordStart = lists:reverse("=?UTF-8?Q?"),
            rfc2047_utf8_encode(Chars, WordStart, 10, []);
        'true' ->
            %% Nothing to escape
            Chars
    end.
|
||
%% Worker of the list-based Q-encoder.
%%   Input   - bytes still to be encoded
%%   Out     - output produced so far, in reverse order
%%   Len     - length of the encoded word currently being built
%%   Pending - reversed encoding of the most recently consumed character;
%%             it is only moved into Out once it is known to fit
rfc2047_utf8_encode(Input, Out, Len, Pending) when Len + length(Pending) > 73 ->
    %% Make sure that the individual encoded words are not longer than 76 chars (including charset etc):
    %% close the current word and let Pending start the next one.
    WordEnd = lists:reverse("?=\r\n "),
    WordStart = Pending ++ lists:reverse("=?UTF-8?Q?"),
    rfc2047_utf8_encode(Input, WordStart ++ WordEnd ++ Out, length(WordStart), []);

%% End of input: flush Pending, terminate the word and restore the order
rfc2047_utf8_encode([], Out, _Len, Pending) ->
    lists:reverse("=?" ++ Pending ++ Out);

%% Printable ASCII characters are not encoded, except space, ?, _, = and .
rfc2047_utf8_encode([Ch | Input], Out, Len, Pending) when
    Ch > 32, Ch < 127, Ch /= 32, Ch /= $?, Ch /= $_, Ch /= $=, Ch /= $.
->
    rfc2047_utf8_encode(Input, Pending ++ Out, Len + length(Pending), [Ch]);
%% Every other byte up to and including 192 is escaped as =XY
rfc2047_utf8_encode([Ch | Input], Out, Len, Pending) when Ch > 0, Ch =< 192 ->
    rfc2047_utf8_encode(Input, Pending ++ Out, Len + length(Pending), encode_byte(Ch));
%% First byte of a UTF-8 sequence: its continuation bytes are consumed
%% here as well, so that a 2-4 byte character is kept on one line
rfc2047_utf8_encode([Ch | Input], Out, Len, Pending) when Ch > 192, Ch =< 247 ->
    SeqLen = utf_char_bytes(Ch),
    {Remaining, Continuation} = encode_extra_utf_bytes(SeqLen - 1, Input),
    rfc2047_utf8_encode(Remaining, Pending ++ Out, Len + length(Pending), Continuation ++ encode_byte(Ch)).
|
||
is_ascii_printable([]) -> 'true'; | ||
is_ascii_printable([H|T]) when H >= 32 andalso H =< 126 -> | ||
|
||
% Bytes that Q-Encoding emits without an "=XY" escape
% (RFC 2047 Sections 4.2 and 5):
%   * ASCII letters, lowercase and uppercase
%   * decimal digits
%   * "!", "*", "+", "-" and "/"
% SPACE is not really part of that set, but it encodes to the single
% byte "_", so for length accounting it is treated as allowed here.
-define(is_rfc2047_q_allowed(C), (
    (C >= $a andalso C =< $z) orelse
    (C >= $A andalso C =< $Z) orelse
    (C >= $0 andalso C =< $9) orelse
    C =:= $\s orelse C =:= $! orelse C =:= $* orelse
    C =:= $+ orelse C =:= $- orelse C =:= $/
)).
|
||
%% Encode a header value according to RFC 2047.
%% Anything that is not a binary is converted with list_to_binary/1 first.
%% A value consisting only of printable ASCII is returned unchanged.
%% Otherwise Q-Encoding is used when at least half of the bytes can be
%% written without escaping, and B-Encoding when fewer can.
rfc2047_utf8_encode(Value) when not is_binary(Value) ->
    rfc2047_utf8_encode(list_to_binary(Value));
rfc2047_utf8_encode(Value) ->
    case is_ascii_printable(Value) of
        true ->
            Value;
        false ->
            % number of bytes that stay readable in Q-Encoding
            Plain = length([Byte || <<Byte>> <= Value, ?is_rfc2047_q_allowed(Byte)]),
            % number of bytes that would have to be escaped
            Escaped = byte_size(Value) - Plain,
            Enc =
                case Plain >= Escaped of
                    true -> q;
                    false -> b
                end,
            rfc2047_utf8_encode(Enc, Value, <<>>)
    end.
|
||
%% Start a new encoded word for the remaining text.
%%
%% An encoded word must not be longer than 75 bytes, counting the
%% leading "=?", the charset name, the "?B?" or "?Q?" marker and the
%% trailing "?=". With the charset fixed to "UTF-8" this leaves 63 bytes
%% for the encoded text. The last argument passed on is the budget of
%% the new word, in the unit the worker counts in.
rfc2047_utf8_encode(_Enc, <<>>, Encoded) ->
    % nothing left to encode
    Encoded;
rfc2047_utf8_encode(b, Text, Encoded) ->
    % B-Encoding: Base64 fits at most 45 raw bytes into 63 encoded
    % bytes, so the budget is counted in raw bytes.
    rfc2047_utf8_encode(b, Text, Encoded, <<>>, 45);
rfc2047_utf8_encode(q, Text, Encoded) ->
    % Q-Encoding: a raw byte takes 1 or 3 encoded bytes, so between 21
    % and 63 raw bytes fit; the budget is counted in encoded bytes.
    rfc2047_utf8_encode(q, Text, Encoded, <<>>, 63).
|
||
%% Collect raw bytes for the encoded word currently being built.
%%   Enc     - q | b, the encoding in use
%%   Input   - raw bytes still to be consumed (second argument)
%%   Acc     - encoded words completed so far
%%   WordAcc - raw bytes collected for the current word
%%   Left    - remaining budget of the current word: encoded bytes for
%%             Q-Encoding, raw bytes for B-Encoding
%% A multi-byte UTF-8 sequence is only added as a whole. When it does not
%% fit, the current word is closed and encoding restarts with a fresh word.
%% The cost subtracted from Left must be the same value the fit check
%% used, otherwise a word can grow past the encoded-word length limit.
rfc2047_utf8_encode(Enc, <<>>, Acc, WordAcc, _Left) ->
    % end of input, flush the word in progress
    rfc2047_append_word(Acc, WordAcc, Enc);
rfc2047_utf8_encode(Enc, All = <<2#11110:5, Rest:27, More/binary>>, Acc, WordAcc, Left) ->
    % A 4-byte UTF-8 sequence: 4 x "=XY" in Q-Encoding, 4 raw bytes in B-Encoding
    Reqd =
        case Enc of
            q -> 12;
            b -> 4
        end,
    case Left >= Reqd of
        true ->
            rfc2047_utf8_encode(Enc, More, Acc, <<WordAcc/binary, 2#11110:5, Rest:27>>, Left - Reqd);
        false ->
            rfc2047_utf8_encode(Enc, All, rfc2047_append_word(Acc, WordAcc, Enc))
    end;
rfc2047_utf8_encode(Enc, All = <<2#1110:4, Rest:20, More/binary>>, Acc, WordAcc, Left) ->
    % A 3-byte UTF-8 sequence: 3 x "=XY" in Q-Encoding, 3 raw bytes in B-Encoding
    Reqd =
        case Enc of
            q -> 9;
            b -> 3
        end,
    case Left >= Reqd of
        true ->
            rfc2047_utf8_encode(Enc, More, Acc, <<WordAcc/binary, 2#1110:4, Rest:20>>, Left - Reqd);
        false ->
            rfc2047_utf8_encode(Enc, All, rfc2047_append_word(Acc, WordAcc, Enc))
    end;
rfc2047_utf8_encode(Enc, All = <<2#110:3, Rest:13, More/binary>>, Acc, WordAcc, Left) ->
    % A 2-byte UTF-8 sequence: 2 x "=XY" in Q-Encoding, 2 raw bytes in B-Encoding
    Reqd =
        case Enc of
            q -> 6;
            b -> 2
        end,
    case Left >= Reqd of
        true ->
            rfc2047_utf8_encode(Enc, More, Acc, <<WordAcc/binary, 2#110:3, Rest:13>>, Left - Reqd);
        false ->
            rfc2047_utf8_encode(Enc, All, rfc2047_append_word(Acc, WordAcc, Enc))
    end;
rfc2047_utf8_encode(Enc, All = <<C, More/binary>>, Acc, WordAcc, Left) ->
    % Any other single byte: not the lead byte of a 2-4 byte sequence
    Reqd =
        case Enc of
            q when not ?is_rfc2047_q_allowed(C) -> 3;
            q -> 1;
            b -> 1
        end,
    case Left >= Reqd of
        true ->
            rfc2047_utf8_encode(Enc, More, Acc, <<WordAcc/binary, C>>, Left - Reqd);
        false ->
            rfc2047_utf8_encode(Enc, All, rfc2047_append_word(Acc, WordAcc, Enc))
    end.
|
||
%% Encode the raw bytes of one word and append the resulting encoded
%% word to Acc. An empty word leaves Acc untouched. Every word but the
%% first is preceded by CRLF and a space.
rfc2047_append_word(Acc, <<>>, _Enc) ->
    Acc;
rfc2047_append_word(Acc, Word, Enc) when Enc =:= q; Enc =:= b ->
    {Marker, Payload} =
        case Enc of
            q -> {$Q, rfc2047_q_encode(Word)};
            b -> {$B, base64:encode(Word)}
        end,
    Separator =
        case Acc of
            % first word in Acc
            <<>> -> <<>>;
            % subsequent word in Acc, needs LWSP in front
            _ -> <<$\r, $\n, $\s>>
        end,
    <<Acc/binary, Separator/binary, "=?UTF-8?", Marker, "?", Payload/binary, "?=">>.
|
||
%% Q-encode the raw bytes of a single word: SPACE becomes "_", bytes
%% accepted by ?is_rfc2047_q_allowed are copied, and every other byte is
%% written as "=" followed by its two hex digits.
rfc2047_q_encode(Word) ->
    rfc2047_q_encode(Word, <<>>).

rfc2047_q_encode(<<>>, Out) ->
    Out;
rfc2047_q_encode(<<$\s, Tail/binary>>, Out) ->
    % SPACE -> _
    rfc2047_q_encode(Tail, <<Out/binary, $_>>);
rfc2047_q_encode(<<Byte, Tail/binary>>, Out) when ?is_rfc2047_q_allowed(Byte) ->
    % byte which needs no encoding
    rfc2047_q_encode(Tail, <<Out/binary, Byte>>);
rfc2047_q_encode(<<High:4, Low:4, Tail/binary>>, Out) ->
    % byte which needs encoding -> =XY, one hex digit per nibble
    rfc2047_q_encode(Tail, <<Out/binary, $=, (hex(High)), (hex(Low))>>).
|
||
%% Returns 'true' when the argument is a binary whose bytes all lie in
%% the range 32..126 (the empty binary included), 'false' for anything else.
is_ascii_printable(<<Byte, Rest/binary>>) when Byte >= 32, Byte =< 126 ->
    is_ascii_printable(Rest);
is_ascii_printable(Other) ->
    % only a fully consumed binary counts as printable
    Other =:= <<>>.
|
||
%% Returns the three characters of the "=XY" escape for byte C in reverse
%% order (low digit, high digit, "="); the callers prepend the result to
%% accumulators that are reversed later.
encode_byte(C) ->
    Low = C rem 16,
    High = C div 16,
    [hex(Low), hex(High), $=].
%% Maps a number to a hex digit character: values below 10 become
%% "0".."9", values from 10 upwards continue from "A". No upper bound
%% is checked.
hex(N) when N < 10 -> $0 + N;
hex(N) -> $A + N - 10.
|
||
%% Length in bytes of the UTF-8 sequence that starts with the given
%% first byte, see https://en.wikipedia.org/wiki/UTF-8#Description
%% Values 128 - 191 and values above 247 match no clause.
%% 0 - 127 (ASCII)
utf_char_bytes(Lead) when Lead >= 0, Lead =< 127 -> 1;
%% 192 - 223 (2#110xxxxx)
utf_char_bytes(Lead) when Lead >= 192, Lead =< 223 -> 2;
%% 224 - 239 (2#1110xxxx)
utf_char_bytes(Lead) when Lead >= 224, Lead =< 239 -> 3;
%% 240 - 247 (2#11110xxx)
utf_char_bytes(Lead) when Lead >= 240, Lead =< 247 -> 4.
|
||
%% Consume Count bytes from the head of Input and escape each of them
%% with encode_byte/1. Returns {RemainingInput, Escaped}, where Escaped
%% holds the escapes in reverse order. Each consumed byte has to be in
%% the range 128..191; otherwise (or when Input runs out early) no
%% clause matches.
encode_extra_utf_bytes(Count, Input) ->
    encode_extra_utf_bytes(Count, Input, []).

encode_extra_utf_bytes(0, Input, Escaped) ->
    {Input, Escaped};
encode_extra_utf_bytes(Count, [Byte | Input], Escaped) when Byte >= 128, Byte =< 191 ->
    encode_extra_utf_bytes(Count - 1, Input, encode_byte(Byte) ++ Escaped).
|
||
%% @doc DKIM sign an email | ||
%% DKIM sign functions | ||
%% RFC 6376 | ||
|
@@ -2033,7 +2117,49 @@ rfc2047_decode_test_() -> | |
{"decode something I encoded myself", | ||
fun() -> | ||
A = <<"Jacek Złydach <[email protected]>"/utf8>>, | ||
?assertEqual(A, decode_header(list_to_binary(rfc2047_utf8_encode(A)), "utf-8")) | ||
?assertEqual(A, decode_header(rfc2047_utf8_encode(A), "utf-8")) | ||
end | ||
} | ||
]. | ||
|
||
%% EUnit generator for the RFC 2047 encoder: forced Q-Encoding, forced
%% B-Encoding, and the automatic choice made by rfc2047_utf8_encode/1.
rfc2047_utf8_encode_test_() ->
    % shared inputs
    LongText = <<"This text encodes to more than 63 bytes. Therefore, it should be encoded in multiple encoded words.">>,
    FourByteChar = <<16#F0, 16#9F, 16#80, 16#84>>,
    AllAscii = << <<X>> || X <- lists:seq(0, 16#7F) >>,
    [
        {"Q-Encoding", fun() ->
            % the unescaped character classes
            ?assertEqual(
                <<"=?UTF-8?Q?abcdefghijklmnopqrstuvwxyz?=">>,
                rfc2047_utf8_encode(q, <<"abcdefghijklmnopqrstuvwxyz">>, <<>>)
            ),
            ?assertEqual(
                <<"=?UTF-8?Q?ABCDEFGHIJKLMNOPQRSTUVWXYZ?=">>,
                rfc2047_utf8_encode(q, <<"ABCDEFGHIJKLMNOPQRSTUVWXYZ">>, <<>>)
            ),
            ?assertEqual(
                <<"=?UTF-8?Q?0123456789?=">>,
                rfc2047_utf8_encode(q, <<"0123456789">>, <<>>)
            ),
            ?assertEqual(
                <<"=?UTF-8?Q?!*+-/?=">>,
                rfc2047_utf8_encode(q, <<"!*+-/">>, <<>>)
            ),
            % text that has to be split into several encoded words
            ?assertEqual(
                <<
                    "=?UTF-8?Q?This_text_encodes_to_more_than_63_bytes=2E_Therefore=2C_it_shou?=\r\n"
                    " =?UTF-8?Q?ld_be_encoded_in_multiple_encoded_words=2E?="
                >>,
                rfc2047_utf8_encode(q, LongText, <<>>)
            ),
            % a multi-byte character must not be torn apart at the break
            ?assertEqual(
                <<
                    "=?UTF-8?Q?We_place_an_UTF8_4byte_character_over_the_breaking_point_here_?=\r\n"
                    " =?UTF-8?Q?=F0=9F=80=84?="
                >>,
                rfc2047_utf8_encode(
                    q,
                    <<"We place an UTF8 4byte character over the breaking point here ", FourByteChar/binary>>,
                    <<>>
                )
            )
        end},
        {"B-Encoding", fun() ->
            ?assertEqual(
                <<"=?UTF-8?B?U29tZSBzaG9ydCB0ZXh0Lg==?=">>,
                rfc2047_utf8_encode(b, <<"Some short text.">>, <<>>)
            ),
            ?assertEqual(
                <<
                    "=?UTF-8?B?VGhpcyB0ZXh0IGVuY29kZXMgdG8gbW9yZSB0aGFuIDYzIGJ5dGVzLiBUaGVy?=\r\n"
                    " =?UTF-8?B?ZWZvcmUsIGl0IHNob3VsZCBiZSBlbmNvZGVkIGluIG11bHRpcGxlIGVuY29k?=\r\n"
                    " =?UTF-8?B?ZWQgd29yZHMu?="
                >>,
                rfc2047_utf8_encode(b, LongText, <<>>)
            ),
            ?assertEqual(
                <<
                    "=?UTF-8?B?AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKiss?=\r\n"
                    " =?UTF-8?B?LS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZ?=\r\n"
                    " =?UTF-8?B?WltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3x9fn8=?="
                >>,
                rfc2047_utf8_encode(b, AllAscii, <<>>)
            ),
            ?assertEqual(
                <<
                    "=?UTF-8?B?UGxhY2UgYW4gVVRGOCA0Ynl0ZSBjaGFyYWN0ZXIgYXQgdGhlIGJyZWFr?=\r\n"
                    " =?UTF-8?B?8J+AhA==?="
                >>,
                rfc2047_utf8_encode(
                    b,
                    <<"Place an UTF8 4byte character at the break", FourByteChar/binary>>,
                    <<>>
                )
            )
        end},
        {"Pick encoding", fun() ->
            % printable ASCII stays as it is
            ?assertEqual(<<"asdf">>, rfc2047_utf8_encode(<<"asdf">>)),
            % half of the bytes readable -> Q
            ?assertEqual(<<"=?UTF-8?Q?x=09?=">>, rfc2047_utf8_encode(<<"x\t">>)),
            % less than half readable -> B
            ?assertEqual(<<"=?UTF-8?B?CXgJ?=">>, rfc2047_utf8_encode(<<"\tx\t">>))
        end}
    ].
|
@@ -2074,7 +2200,7 @@ encoding_test_() -> | |
[{<<"charset">>,<<"US-ASCII">>}], | ||
disposition => <<"inline">>}, | ||
<<"This is a plain message">>}, | ||
Result = <<"Subject: =?UTF-8?Q?Fr=C3=A6derik=20H=C3=B8lljen?=\r\nFrom: =?UTF-8?Q?Fr=C3=A6derik=20H=C3=B8lljen?= <[email protected]>\r\nTo: [email protected]\r\nMessage-ID: <[email protected]>\r\nMIME-Version: 1.0\r\nDate: Sun, 01 Nov 2009 14:44:47 +0200\r\n\r\nThis is a plain message">>, | ||
Result = <<"Subject: =?UTF-8?Q?Fr=C3=A6derik_H=C3=B8lljen?=\r\nFrom: =?UTF-8?Q?Fr=C3=A6derik_H=C3=B8lljen?= <[email protected]>\r\nTo: [email protected]\r\nMessage-ID: <[email protected]>\r\nMIME-Version: 1.0\r\nDate: Sun, 01 Nov 2009 14:44:47 +0200\r\n\r\nThis is a plain message">>, | ||
?assertEqual(Result, encode(Email)) | ||
end | ||
}, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters