First, it's not clear to me whether this is a bug or just a difference in expected behavior, since I'm trying to use Floki to parse XML rather than HTML.
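The issue is that a CDATA section inside a <title> tag is not handled: the tokenizer treats everything up to the closing </title> as raw text, so the CDATA markup is never parsed.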
I would expect CDATA inside a <title> tag to be handled the same way as it is inside other tags.
I recognize that this is maybe an artifact of trying to use the library for the wrong purpose (parsing XML) so no problem if you want to close this as "won't fix" or whatever.
If you do want to fix it, here's a patch that appears to do it.
diff --git a/src/floki_mochi_html.erl b/src/floki_mochi_html.erl
index d4e3337..4096161 100644
--- a/src/floki_mochi_html.erl
+++ b/src/floki_mochi_html.erl
@@ -301,13 +301,10 @@ tokens(B, S=#decoder{offset=O}, Acc) ->
{Tag2, S2} = tokenize_script(B, S1),
tokens(B, S2, [Tag2, Tag | Acc]);
style ->
{Tag2, S2} = tokenize_style(B, S1),
tokens(B, S2, [Tag2, Tag | Acc]);
- title ->
- {Tag2, S2} = tokenize_title(B, S1),
- tokens(B, S2, [Tag2, Tag | Acc]);
textarea ->
{Tag2, S2} = tokenize_textarea(B, S1),
tokens(B, S2, [Tag2, Tag | Acc]);
none ->
tokens(B, S1, [Tag | Acc])
@@ -318,12 +315,10 @@ parse_flag({start_tag, B, _, false}) ->
case string:to_lower(binary_to_list(B)) of
"script" ->
script;
"style" ->
style;
- "title" ->
- title;
"textarea" ->
textarea;
_ ->
none
end;
@@ -822,32 +817,10 @@ tokenize_style(Bin, S=#decoder{offset=O}, Start) ->
tokenize_style(Bin, ?INC_CHAR(S, C), Start);
<<_:Start/binary, Raw/binary>> ->
{{data, Raw, false}, S}
end.
-tokenize_title(Bin, S=#decoder{offset=O}) ->
- tokenize_title(Bin, S, O).
-
-tokenize_title(Bin, S=#decoder{offset=O}, Start) ->
- case Bin of
- %% Just a look-ahead, we want the end_tag separately
- <<_:O/binary, $<, $/, TT, II, TT2, LL, EE, ZZ, _/binary>>
- when (TT=:= $t orelse TT =:= $T) andalso
- (II=:= $i orelse II =:= $I) andalso
- (TT2=:= $t orelse TT2 =:= $T) andalso
- (LL=:= $l orelse LL =:= $L) andalso
- (EE=:= $e orelse EE =:= $E) andalso
- ?PROBABLE_CLOSE(ZZ) ->
- Len = O - Start,
- <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- {{data, Raw, false}, S};
- <<_:O/binary, C, _/binary>> ->
- tokenize_title(Bin, ?INC_CHAR(S, C), Start);
- <<_:Start/binary, Raw/binary>> ->
- {{data, Raw, false}, S}
- end.
-
tokenize_textarea(Bin, S=#decoder{offset=O}) ->
tokenize_textarea(Bin, S, O).
tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
case Bin of
Description
First, it's not clear to me whether this is a bug or just a difference in expected behavior, since I'm trying to use Floki to parse XML rather than HTML.
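The issue is that a CDATA section inside a <title> tag is not handled: the tokenizer treats everything up to the closing </title> as raw text, so the CDATA markup is never parsed.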
To Reproduce
Steps to reproduce the behavior:
Expected behavior
I would expect CDATA inside a <title> tag to be handled the same way as it is inside other tags.
I recognize that this is maybe an artifact of trying to use the library for the wrong purpose (parsing XML) so no problem if you want to close this as "won't fix" or whatever.
Patch
If you do want to fix it, here's a patch that appears to do it.