Detect and convert html body in the correct charset before parsing it

Signed-off-by: Thomas Citharel <tcit@tcit.fr>
This commit is contained in:
Thomas Citharel
2022-01-18 12:47:45 +01:00
parent c8735e5837
commit fbe5a8d0c4
4 changed files with 83 additions and 0 deletions

View File

@@ -74,6 +74,7 @@ defmodule Mobilizon.Service.RichMedia.Parser do
{:is_html, _response_headers, true} <-
{:is_html, response_headers, is_html(response_headers)} do
body
|> convert_utf8(response_headers)
|> maybe_parse()
|> Map.put(:url, url)
|> maybe_add_favicon()
@@ -317,4 +318,78 @@ defmodule Mobilizon.Service.RichMedia.Parser do
defp default_user_agent(_url) do
Config.instance_user_agent()
end
defp convert_utf8(body, headers) do
headers
|> get_header("Content-Type")
|> handle_charset(body)
end
defp handle_charset(nil, body) do
case detect_charset_from_meta(body) do
"" -> body
nil -> body
charset -> convert_body(body, charset)
end
end
defp handle_charset(content_type, body) do
case charset_from_content_type(content_type) do
nil -> handle_charset(nil, body)
charset -> convert_body(body, charset)
end
end
defp charset_from_content_type(content_type) do
with [_, params] <- :binary.split(content_type, ";"),
%{"charset" => charset} <- Utils.params(params) do
charset
else
_ -> nil
end
end
defp detect_charset_from_meta(body) do
Logger.debug("Trying to detect charset from meta")
document = Floki.parse_document!(body)
case document
|> Floki.find("meta[http-equiv=\"content-type\"]")
|> List.first() do
nil ->
case document
|> Floki.find("meta[http-equiv=\"Content-Type\"]")
|> List.first() do
nil -> nil
meta -> content_type_from_meta(meta)
end
meta ->
content_type_from_meta(meta)
end
end
defp content_type_from_meta(meta) do
Logger.debug("Finding content-type into <meta> element")
meta
|> Floki.attribute("content")
|> List.first()
|> String.trim()
|> charset_from_content_type()
end
defp convert_body(body, "utf-8"), do: body
defp convert_body(body, charset) do
Logger.debug("Converting body from #{charset}")
Codepagex.to_string!(body, fix_charset(charset))
end
defp fix_charset("windows-1252"), do: :"VENDORS/MICSFT/WINDOWS/CP1252"
defp fix_charset(charset) do
String.replace(charset, "-", "_")
end
end