This project has moved and is read-only. For the latest updates, please go here.
2
Vote

Recursive algorithms can result in stack overflows

description

I've rewritten .InnerText, .WriteTo() and .CloseNode() to use iterative rather than recursive algorithms, to improve the stability of HAP when parsing untrusted HTML. I've attached the patch we're using. It's based on 1.3.0 (which is the version we're using). I can work on porting those changes to tip if you'd be willing to accept them, but, as you can see if you look at the patch, the changes are quite extensive.

Here are some tests (written using NUnit) that trigger the crashes we were experiencing:
/// <summary>
/// Runs <paramref name="action"/> to completion on a dedicated thread created with a
/// maximum stack size of 256 * 1000 bytes, blocking the caller until that thread exits.
/// </summary>
/// <param name="action">The code to execute on the small-stack thread.</param>
/// <exception cref="ArgumentNullException"><paramref name="action"/> is null.</exception>
/// <remarks>
/// An exception thrown by <paramref name="action"/> is captured on the worker thread and
/// rethrown on the calling thread, so it fails the calling test instead of going unhandled
/// on the worker thread. A stack overflow cannot be caught and still terminates the process.
/// </remarks>
private void CrashTest(Action action) {
    if (action == null) {
        throw new ArgumentNullException("action");
    }

    // NOTE(review): the name suggests this mirrors the ASP.NET worker stack size — confirm.
    var aspNetStackSize = 256 * 1000;

    // Written by the worker thread, read by the caller only after Join() has returned.
    System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

    var thread = new Thread(() => {
        try {
            action();
        } catch (Exception ex) {
            // Capture instead of letting the exception escape the worker thread.
            failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(ex);
        }
    }, aspNetStackSize);
    thread.Start();
    thread.Join();

    if (failure != null) {
        // Rethrows the original exception object with its original stack trace preserved.
        failure.Throw();
    }
}

/// <summary>
/// Loads a document made of 5000 never-closed &lt;span&gt; tags wrapped in one &lt;div&gt;,
/// running the load through <c>CrashTest</c>. The test contains no assertions.
/// </summary>
[Test]
public void Test_Unclosed_Nodes_Do_Not_Stackoverflow_Even_If_The_Dom_Is_Deep() {
    const int depth = 5000;

    // Opening tags only: none of the spans is ever closed.
    var openSpans = string.Concat(Enumerable.Repeat("<span>", depth));
    var html = "<div>" + openSpans + "</div>";

    CrashTest(() => {
        var document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(html);
    });
}

/// <summary>
/// Loads 5000 nested, properly closed &lt;span&gt; elements and then reads InnerText,
/// OuterHtml and InnerHtml of the document node, all inside <c>CrashTest</c>.
/// The test contains no assertions; the values read are discarded.
/// </summary>
[Test]
public void Test_InnerOuter_Does_Not_Stackoverflow_Even_If_The_Dom_Is_Deep() {
    const int depth = 5000;

    // All opening tags first, then all closing tags.
    var openSpans = string.Concat(Enumerable.Repeat("<span>", depth));
    var closeSpans = string.Concat(Enumerable.Repeat("</span>", depth));
    var html = openSpans + closeSpans;

    CrashTest(() => {
        var document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(html);

        // The locals exist only so each property getter is invoked.
        var innerText = document.DocumentNode.InnerText;
        var outerHtml = document.DocumentNode.OuterHtml;
        var innerHtml = document.DocumentNode.InnerHtml;
    });
}

/// <summary>
/// Loads 1000 nested &lt;div&gt;&lt;span&gt; pairs, each properly closed, and saves the
/// document to a <see cref="StringWriter"/>, all inside <c>CrashTest</c>.
/// The test contains no assertions.
/// </summary>
[Test]
public void Test_Save_Does_Not_Stackoverflow_Even_If_The_Dom_Is_Deep() {
    const int pairs = 1000;

    // All opening pairs first, then the matching closing pairs.
    var openTags = string.Concat(Enumerable.Repeat("<div><span>", pairs));
    var closeTags = string.Concat(Enumerable.Repeat("</span></div>", pairs));
    var html = openTags + closeTags;

    CrashTest(() => {
        var document = new HtmlAgilityPack.HtmlDocument();
        var output = new StringWriter();

        document.LoadHtml(html);
        document.Save(output);
    });
}

file attachments

comments

Aaron_Maenpaa wrote Mar 20, 2014 at 2:58 PM

wrote Mar 21, 2014 at 12:28 AM