NodeCollection Returns Null

Topics: Developer Forum, Project Management Forum, User Forum
Jul 24, 2009 at 10:56 PM

Hi everyone,

I am new to ASP.NET. I am trying to use the Html Agility Pack, but SelectNodes always returns null. Please see my code below.

            // Fetch the page at strURL and let HtmlAgilityPack parse it.
            HtmlWeb hw = new HtmlWeb();

            HtmlDocument doc = hw.Load(strURL);

            // Select every <div> whose class attribute is exactly 'productTitle'.
            // This is the call that is reported to come back as null.
            HtmlNodeCollection nodecoll = doc.DocumentNode.SelectNodes("//div[@class='productTitle']");

My nodecoll is always null.

Any experts please help.

Thanks,

 

// Same snippet as above (the original paste was duplicated into itself).
HtmlWeb hw = new HtmlWeb();
HtmlDocument doc = hw.Load(strURL);
//string path = "//td[@class='details']";
HtmlNodeCollection nodecoll = doc.DocumentNode.SelectNodes("//div[@class='productTitle']");

 

 

Jul 25, 2009 at 1:36 PM

Based on your code, these are the problems I would expect:

  1. There are no div element(s) with a class attribute with value "productTitle".
  2. The source HTML (from strURL) is malformed, e.g. <script> tags that end with /> instead of </script>. This causes everything after the (first?) opening script tag to be swallowed: HtmlAgilityPack closes the script tag itself, and any other element that was opened before the script tag is closed as well.
  3. The web server doesn't accept HtmlWeb's request, e.g. the server script requires the User-Agent header (or another header) to be set, and gives you some other HTML instead of the HTML you expect.

You can use this class to save whatever HtmlDocument thinks it sees.

using System;
using System.IO;
using System.Text;
using System.Xml;
using HtmlAgilityPack;

/// <summary>
/// Diagnostic helpers that write what HtmlAgilityPack parsed (a whole
/// <see cref="HtmlDocument"/> or a single <see cref="HtmlNode"/>) to a file,
/// so the parsed result can be inspected by hand.
/// </summary>
public static class HtmlAgilityPackHelper
{
    #region Public static methods: Save(HtmlDocument), Save(HtmlNode) + overloads.

    /// <summary>
    /// Saves the <see cref="HtmlDocument"/> to the specified path as HTML.
    /// </summary>
    /// <param name="path">The file to write to.</param>
    /// <param name="document">The <see cref="HtmlDocument"/> to save.</param>
    /// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="document"/> is <c>null</c>.</exception>
    /// <seealso cref="Save(string, HtmlDocument, bool)"/>
    public static void Save( string path, HtmlDocument document )
    {
        Save( path, document, false );
    }

    /// <summary>
    /// Saves the <see cref="HtmlDocument"/> to the specified path.
    /// </summary>
    /// <param name="path">The file to write to. An existing file is overwritten.</param>
    /// <param name="document">The <see cref="HtmlDocument"/> to save.</param>
    /// <param name="asXml">if set to <c>true</c> save document as XML.</param>
    /// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="document"/> is <c>null</c>.</exception>
    /// <seealso cref="HtmlNode"/>
    /// <seealso cref="HtmlDocument"/>
    public static void Save( string path, HtmlDocument document, bool asXml )
    {
        if( path == null || document == null )
        {
            string missing = path == null ? "path" : "document";
            DebugBreakOrThrow( "Figure out why " + missing + " is null", new ArgumentNullException( missing ) );
        }

        // DeclaredEncoding is set if document.OptionReadEncoding is true, and a valid Content-Type is found.
        // When the OptionReadEncoding is true, HtmlDocument tries to find the encoding by reading the <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />.
        // If it's null, we use the default.
        Encoding encoding = document.DeclaredEncoding ?? document.Encoding;

        using( FileStream fileStream = new FileStream( path, FileMode.Create, FileAccess.Write, FileShare.Read, 1024 ) )
        {
            using( StreamWriter streamWriter = new StreamWriter( fileStream, encoding ) )
            {
                if( asXml )
                {
                    XmlWriterSettings settings = new XmlWriterSettings();
                    settings.Encoding = encoding;
                    settings.Indent = true;
                    // The enclosing using-blocks own the stream; the XmlWriter must not
                    // close it, otherwise the Flush() below would hit a disposed writer.
                    settings.CloseOutput = false;
                    settings.OmitXmlDeclaration = true;
                    settings.NewLineOnAttributes = true;
                    settings.CheckCharacters = true;

                    settings.ConformanceLevel = ConformanceLevel.Fragment;

                    // Dispose the XmlWriter so everything it buffered is pushed into
                    // streamWriter before the file is closed.
                    using( XmlWriter xmlWriter = XmlWriter.Create( streamWriter, settings ) )
                    {
                        document.Save( xmlWriter );
                    }
                }
                else
                {
                    document.Save( streamWriter );
                }

                streamWriter.Flush();
            }
        }
    }

    /// <summary>
    /// Saves the <see cref="HtmlNode"/> to the specified path.
    /// </summary>
    /// <param name="path">The file to write to. An existing file is overwritten.</param>
    /// <param name="node">The <see cref="HtmlNode"/> to save.</param>
    /// <remarks>This method will use the <see cref="HtmlNode.OwnerDocument">HtmlNode.OwnerDocument</see> to determine which encoding to use.</remarks>
    /// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="node"/> is <c>null</c>.</exception>
    /// <seealso cref="Save(string, HtmlNode, Encoding)"/>
    /// <seealso cref="HtmlNode"/>
    /// <seealso cref="HtmlDocument"/>
    public static void Save( string path, HtmlNode node )
    {
        if( path == null || node == null )
        {
            string missing = path == null ? "path" : "node";
            DebugBreakOrThrow( "Figure out why " + missing + " is null", new ArgumentNullException( missing ) );
        }

        Save( path, node, node.OwnerDocument.DeclaredEncoding ?? node.OwnerDocument.Encoding );
    }

    /// <summary>
    /// Saves the <see cref="HtmlNode"/> to the specified path.
    /// </summary>
    /// <param name="path">The file to write to. An existing file is overwritten.</param>
    /// <param name="node">The <see cref="HtmlNode"/> to save.</param>
    /// <param name="encoding">The encoding to use when writing the file.</param>
    /// <remarks>
    /// If the node has a parent, the parent's content is written, which includes the
    /// node's siblings as well as the node itself.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="path"/>, <paramref name="node"/> or <paramref name="encoding"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The node has children but no parent and is not the document node.</exception>
    /// <seealso cref="HtmlNode"/>
    /// <seealso cref="HtmlDocument"/>
    public static void Save( string path, HtmlNode node, Encoding encoding )
    {
        if( path == null || node == null )
        {
            string missing = path == null ? "path" : "node";
            DebugBreakOrThrow( "Figure out why " + missing + " is null", new ArgumentNullException( missing ) );
        }

        // Will only be triggered if the caller isn't called by another Save().
        if( encoding == null )
        {
            DebugBreakOrThrow( "Figure out why encoding is null.", new ArgumentNullException( "encoding" ) );
        }

        using( FileStream fileStream = new FileStream( path, FileMode.Create, FileAccess.Write, FileShare.Read, 1024 ) )
        {
            using( StreamWriter streamWriter = new StreamWriter( fileStream, encoding ) )
            {
                // StreamWriter is a TextWriter, so we can pass it to HtmlNode.WriteContentTo(TextWriter).
                // So node.WriteTo only saves the current node, which is only useful if the node has no children.
                // node.WriteContentTo, only saves the current nodes children, therefore if the current node has a parent, we use the parent to save.
                // this will include all siblings aswell :S
                if( node.ParentNode != null )
                {
                    node.ParentNode.WriteContentTo( streamWriter );
                }
                else if( node.HasChildNodes == false )
                {
                    node.WriteTo( streamWriter );
                }
                else if( node.Name == HtmlNode.HtmlNodeTypeNameDocument )
                {
                    node.WriteContentTo( streamWriter );
                }
                else
                {
                    // TODO: Properly save parent-less node with children.
                    DebugBreakOrThrow( "Properly save parent-less node with children. Inspect 'node'.", new InvalidOperationException( "Don't know how to save the node, and it's children!" ) );
                }

                streamWriter.Flush();
            }
        }
    }
    #endregion

    /// <summary>
    /// Reports a failed precondition: when a debugger is attached it logs the hint and
    /// breaks so the call stack can be inspected, and in every case it then throws
    /// <paramref name="exception"/>.
    /// </summary>
    /// <param name="What_needs_to_be_checked_at_this_point">Hint for the developer; shown in the debugger output. May be <c>null</c>.</param>
    /// <param name="exception">The exception to throw.</param>
    private static void DebugBreakOrThrow( string What_needs_to_be_checked_at_this_point, Exception exception )
    {
        if( System.Diagnostics.Debugger.IsAttached )
        {
            // Open the Call-stack toolwindow: Debug -> Windows -> Call Stack. Double-click the next item in the list.
            // The parentheses matter: '+' binds tighter than '??', so without them the fallback text could never be used.
            System.Diagnostics.Debugger.Log( 1, "HtmlAgilityPackHelper -> DebugBreakOrThrow()", "Supplied message: " + (What_needs_to_be_checked_at_this_point ?? "None-supplied") );
            System.Diagnostics.Debugger.Break();
        }

        // Always throw, also after the break: returning normally would let the caller
        // continue with the invalid state that was just reported.
        throw exception;
    }
}
You can use it like this:
// Example: load a page, dump what HtmlAgilityPack parsed to a file, then run the XPath query.
// NOTE(review): strURL is not declared in this snippet; it is assumed to hold the page address.
static void Main(string[] args)
{
    HtmlWeb hw = new HtmlWeb();

    HtmlDocument doc = hw.Load( strURL );

    // Write the parsed document to disk so it can be compared with the expected HTML.
    HtmlAgilityPackHelper.Save( "parsed-html.txt", doc );
    MessageBox.Show("HTML is located: " + new FileInfo("parsed-html.txt").FullName);

    HtmlNodeCollection nodecoll = doc.DocumentNode.SelectNodes( "//div[@class='productTitle']" );
}
Jul 27, 2009 at 4:36 PM

thank you so much Coolspin,

I will put this code in my project and try it. I will let you know how it goes.

Thanks again,

Jul 27, 2009 at 4:59 PM

Hi CoolSpin,

After running your code, the saved file does not contain anything matching [@class='productTitle']. I guess the cause is problem #3 that you mentioned earlier.

Could you please explain how to setup the header?

Thanks,

 

 

 

Aug 2, 2009 at 11:58 AM

The method that creates the WebRequest object is the private method

private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc)

You can set custom headers on the req-object's Headers-property:

// User-Agent is a restricted header on HttpWebRequest: it must be set through the
// UserAgent property, because Headers.Add(...) throws an ArgumentException for it.
req.UserAgent = "Hello World Browser/1.0.0.0";
// Headers that are not restricted can be added through the Headers collection:
req.Headers.Add(HttpRequestHeader.AcceptLanguage, "en-us");

You can also use WebRequest.Create(...) yourself, and pass the GetResponseStream()  on the HttpWebResponse object returned from req.GetResponse(), to HtmlDocument.Load(stream);. But then you must yourself check for Content-Type headers and if they are present, you should tell the Load() method what encoding to use. You should assume the server knows what it's sending, unless overridden inside the HTML with a META HTTP-EQUIV="Content-Type".


Aug 3, 2009 at 5:32 PM

Thank you so much, CoolSpin, for the example. I will try to apply this code and let you know how it works out.

 

Thanks again for a reply.

 

 

Sep 11, 2009 at 11:45 AM

Hi, all.

I think it is not good practice to return null.

Empty collection pattern is better.

Feb 5, 2010 at 9:51 AM
Edited Feb 5, 2010 at 9:55 AM

It is best practice to have a generic empty ("null object") instance of the node collection type, to avoid scenarios where exceptions can be thrown. That said, there are still cases where I would want an exception to be thrown.

Let's take this case as an example; I need to fix this code. The XPath is right, and I also added the correct HTTP header in my DownloadHtml class. I bolded the line where the error occurs.

System.ArgumentNullException was unhandled
  Message="Value cannot be null.\r\nParameter name: source"
  Source="System.Core"
  ParamName="source"
  StackTrace:
       at System.Linq.Enumerable.Select[TSource,TResult](IEnumerable`1 source, Func`2 selector)
       at GoogleTrends.Form1.button1_Click(Object sender, EventArgs e) in C:\Users\Daniel\Documents\Visual Studio 2008\Projects\GoogleTrends\GoogleTrends\Form1.cs:line 32
       at System.Windows.Forms.Control.OnClick(EventArgs e)
       at System.Windows.Forms.Button.OnClick(EventArgs e)
       at System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
       at System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
       at System.Windows.Forms.Control.WndProc(Message& m)
       at System.Windows.Forms.ButtonBase.WndProc(Message& m)
       at System.Windows.Forms.Button.WndProc(Message& m)
       at System.Windows.Forms.Control.ControlNativeWindow.OnMessage(Message& m)
       at System.Windows.Forms.Control.ControlNativeWindow.WndProc(Message& m)
       at System.Windows.Forms.NativeWindow.DebuggableCallback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
       at System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
       at System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(Int32 dwComponentID, Int32 reason, Int32 pvLoopData)
       at System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
       at System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
       at System.Windows.Forms.Application.Run(Form mainForm)
       at GoogleTrends.Program.Main() in C:\Users\Daniel\Documents\Visual Studio 2008\Projects\GoogleTrends\GoogleTrends\Program.cs:line 18
       at System.AppDomain._nExecuteAssembly(Assembly assembly, String[] args)
       at System.AppDomain.ExecuteAssembly(String assemblyFile, Evidence assemblySecurity, String[] args)
       at Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly()
       at System.Threading.ThreadHelper.ThreadStart_Context(Object state)
       at System.Threading.ExecutionContext.Run(ExecutionContext executionContext, ContextCallback callback, Object state)
       at System.Threading.ThreadHelper.ThreadStart()
  InnerException:

 

 

/// <summary>
/// Downloads the Google Trends page, selects the nodes matching the XPath and shows
/// them in <c>dataGridView1</c>. The grid is cleared when nothing matches.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">The event data.</param>
private void button1_Click(object sender, EventArgs e)
{
    // NOTE(review): browsers add <tbody> elements to their DOM that are often absent
    // from the raw markup, so an XPath copied from a browser tool may never match the
    // downloaded HTML. Verify the path against the HTML actually downloaded.
    string xpath = @"/html/body/center/table/tbody/tr[3]/td/table/tbody/tr/td[3]/table[3]/tbody/tr/td/table/tbody/tr/td";
    string url = @"http://www.google.com/trends";

    string html = DownloadHtml.GetHtmlFromUrl(new Uri(url));

    HtmlAgilityPack.HtmlDocument hdoc = new HtmlAgilityPack.HtmlDocument();
    hdoc.LoadHtml(html);
    HtmlAgilityPack.HtmlNodeCollection nodeCol = hdoc.DocumentNode.SelectNodes(xpath);

    // SelectNodes returns null (not an empty collection) when nothing matches;
    // querying null is what raised the ArgumentNullException ("source").
    if (nodeCol == null)
    {
        dataGridView1.DataSource = null;
        dataGridView1.Update();
        return;
    }

    // Materialize the query: DataGridView can only bind to list-like sources,
    // not to a deferred IEnumerable.
    var result = (from v in nodeCol
                  select new GTrends
                  {
                      myItem = v.WriteTo(),
                      Website = "Google Trends"
                  }).ToList();

    dataGridView1.DataSource = result;
    dataGridView1.Update();
}

 

 

 

 

 

 

-------

DownloadHtml.cs

---------

 

using System;
using System.Data;
using System.Configuration;
using System.Linq;
using System.Web;

using System.Xml.Linq;
using System.Net;
using System.Text;
using System.IO;



namespace GoogleTrends
{




/// <summary>
/// Helpers for downloading HTML over HTTP with <see cref="HttpWebRequest"/>.
/// </summary>
public static class DownloadHtml
{
    /// <summary>
    /// Downloads the body of <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">The address to request.</param>
    /// <returns>
    /// The response body decoded as UTF-8, or <see cref="string.Empty"/> when the
    /// response status is not in the 2xx range.
    /// </returns>
    public static string GetHtmlFromUrl(Uri url)
    {
        string html = string.Empty;
        HttpWebRequest request = GenerateHttpWebRequest(url);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (CategorizeResponse(response) == ResponseCategories.Success)
            {
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                {
                    html = reader.ReadToEnd();
                }
            }
        }
        return html;
    }

    /// <summary>
    /// Creates a request for <paramref name="uri"/> with a browser-like User-Agent,
    /// a cookie container, automatic redirects and the default credentials.
    /// </summary>
    /// <param name="uri">The address to request.</param>
    /// <returns>The configured, not yet sent, request.</returns>
    public static HttpWebRequest GenerateHttpWebRequest(Uri uri)
    {
        // If requests turn out to vary per site, this may need to switch on the target site.
        HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(uri);

        // The User-Agent is sent through the dedicated property. The former extra
        // "HTTP_USER_AGENT" header was dropped: that is the CGI server-variable name,
        // not an HTTP request header, so it only added a meaningless header.
        httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0b; Windows NT 5.0)";

        // A cookie container must be assigned for the request to pull the cookies.
        httpRequest.CookieContainer = new CookieContainer();

        httpRequest.AllowAutoRedirect = true; // example, Hanes.com

        httpRequest.Credentials = CredentialCache.DefaultCredentials;

        return httpRequest;
    }

    /// <summary>
    /// Creates a POST request for <paramref name="uri"/> and writes
    /// <paramref name="postData"/> (UTF-8 encoded) into its request stream.
    /// </summary>
    /// <param name="uri">The address to post to.</param>
    /// <param name="postData">The request body.</param>
    /// <param name="contentType">The Content-Type of the body; not set when null or empty.</param>
    /// <returns>The request, with the body already written.</returns>
    public static HttpWebRequest GenerateHttpWebRequest(Uri uri, string postData, string contentType)
    {
        HttpWebRequest httpRequest = GenerateHttpWebRequest(uri);

        byte[] bytes = Encoding.UTF8.GetBytes(postData);

        // A request body is only allowed for verbs such as POST; the default is GET.
        httpRequest.Method = "POST";
        if (!string.IsNullOrEmpty(contentType))
        {
            httpRequest.ContentType = contentType;
        }

        // Content-Length counts bytes, not characters: non-ASCII characters take
        // more than one byte in UTF-8.
        httpRequest.ContentLength = bytes.Length;

        using (Stream requestStream = httpRequest.GetRequestStream())
        {
            requestStream.Write(bytes, 0, bytes.Length);
        }

        return httpRequest;
    }

    /// <summary>
    /// Attaches a proxy with the given credentials to <paramref name="httpRequest"/>.
    /// </summary>
    /// <param name="httpRequest">The request to modify; returned unchanged when <c>null</c>.</param>
    /// <param name="proxyUri">The proxy address.</param>
    /// <param name="proxyId">The proxy user name.</param>
    /// <param name="proxyPassword">The proxy password.</param>
    /// <param name="proxyDomain">The proxy user's domain.</param>
    /// <returns>The same request instance that was passed in.</returns>
    public static HttpWebRequest AddProxyInfoToRequest(HttpWebRequest httpRequest, Uri proxyUri, string proxyId, string proxyPassword, string proxyDomain)
    {
        if (httpRequest != null)
        {
            WebProxy proxyInfo = new WebProxy();
            proxyInfo.Address = proxyUri;
            proxyInfo.BypassProxyOnLocal = true;
            proxyInfo.Credentials = new NetworkCredential(proxyId, proxyPassword, proxyDomain);
            httpRequest.Proxy = proxyInfo;
        }
        return httpRequest;
    }

    /// <summary>
    /// Maps the numeric status code of <paramref name="httpResponse"/> to its category.
    /// </summary>
    /// <param name="httpResponse">The response to categorize.</param>
    /// <returns>The category of the response's status code.</returns>
    public static ResponseCategories CategorizeResponse(HttpWebResponse httpResponse)
    {
        // Just in case more success codes are defined in the future by HttpStatusCode,
        // we check for the numeric ranges instead of using the HttpStatusCode enum,
        // as it overloads some values.
        int statusCode = (int)httpResponse.StatusCode;

        if ((statusCode >= 100) && (statusCode <= 199))
        {
            return ResponseCategories.Informational;
        }
        else if ((statusCode >= 200) && (statusCode <= 299))
        {
            return ResponseCategories.Success;
        }
        else if ((statusCode >= 300) && (statusCode <= 399))
        {
            return ResponseCategories.Redirected;
        }
        else if ((statusCode >= 400) && (statusCode <= 499))
        {
            return ResponseCategories.ClientError;
        }
        else if ((statusCode >= 500) && (statusCode <= 599))
        {
            return ResponseCategories.ServerError;
        }
        return ResponseCategories.Unknown;
    }

    /// <summary>
    /// Categories of HTTP status codes, as returned by <see cref="CategorizeResponse"/>.
    /// </summary>
    public enum ResponseCategories
    {
        Unknown,       // Unknown code (< 100 or > 599)
        Informational, // Informational codes (100-199)
        Success,       // Success codes (200-299)
        Redirected,    // Redirection codes (300-399)
        ClientError,   // Client error codes (400-499)
        ServerError    // Server error codes (500-599)
    }
}
}

 

 

----------------

something i might be missing?