sascsy Aug 3, 2009 at 3:24 AM Hi everyone, yesterday when I tried to fix these two bugs I ran into a few problems. I have now fixed both of them in a different way. I hope this helps. UNIT TEST (if any test fails, please try DNS: 218.85.152.99) ``` [TestMethod()] public void EncodingTest() { HtmlWeb target = new HtmlWeb(); target.AutoDetectEncoding = true; Encoding expected, actual; string url; HtmlDocument doc; //target page encoding by header url = "http://www.google.cn"; expected = Encoding.GetEncoding("gb2312"); doc = target.Load(url); actual = doc.Encoding; Assert.AreEqual(expected, actual); //target page encoding by content url = "http://bt.popgo.net/"; expected = Encoding.GetEncoding("gb2312"); doc = target.Load(url); actual =doc.Encoding; Assert.AreEqual(expected, actual); //target page encoding by default encoding url = "http://homepage.mac.com/joeobrin/classic/fat.html"; expected = Encoding.Default; doc = target.Load(url); actual = doc.Encoding; Assert.AreEqual(expected, actual); } [TestMethod()] public void HtmlNamedEntitiesTest() { HtmlWeb target = new HtmlWeb(); HtmlDocument.OptionEncedeHtmlEntities = true;//replace named entities like " " to " " string[] lines=File.ReadAllLines(@"D:\dic.txt");//line example:¡,¡(each line left is real item,right is named entities except 5 xml named entities,splited by ,) IDictionary htmlNamedEntities = new Dictionary(); foreach(string line in lines) { string key=line.Substring(line.IndexOf(",") + 1); string value= line.Substring(0, line.IndexOf(",")); if (!htmlNamedEntities.ContainsKey(key)) htmlNamedEntities.Add(key,value); } HtmlDocument.htmlSpecNamedEntities = htmlNamedEntities; HtmlDocument doc; string url; target.AutoDetectEncoding = true; //target cdata contains the char '"' and not contains   url = "http://bt.popgo.net/"; doc = target.Load(url); doc.OptionOutputAsXml = true; HtmlNode cdata = doc.DocumentNode.SelectSingleNode("*//body/div/script"); Assert.AreEqual(cdata.WriteTo().Contains("\""), true); Assert.AreNotEqual(cdata.InnerText.Contains(@" "), 
true); //target page contains unicode © url = "http://tlt.its.psu.edu/suggestions/international/web/codehtml.html"; doc = target.Load(url); doc.OptionOutputAsXml = true; string content = doc.DocumentNode.WriteTo(); Assert.AreEqual(content.Contains("©"), true); } PATCHES: (how can I upload files?)``` `HtmlDocument.CS` ```@@ -41,6 +41,9 @@ private Crc32 _crc32 = null; private bool _onlyDetectEncoding = false; + public static System.Collections.Generic.IDictionary htmlSpecNamedEntities;//for fix entities + public static bool OptionEncedeHtmlEntities = false;//for fix entities + // public props /// @@ -219,8 +222,8 @@ throw new ArgumentNullException("html"); } // replace & by & but only once! - Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase); - return rx.Replace(html, "&").Replace("<", "<").Replace(">", ">").Replace("\"", """); + Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;)|(apos;))", RegexOptions.IgnoreCase);//Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase); + return rx.Replace(html, "&").Replace("<", "<").Replace(">", ">").Replace("\"", """).Replace("'","'");//return rx.Replace(html, "&").Replace("<", "<").Replace(">", ">").Replace("\"", """); } /// @@ -581,6 +584,17 @@ _declaredencoding = null; _text = reader.ReadToEnd(); + + if (OptionEncedeHtmlEntities)//for fix entities + { + if (htmlSpecNamedEntities == null) + throw new ArgumentNullException("HtmlSpecEntitiesMustAssignedBeforeConvertToXml"); + foreach (var item in htmlSpecNamedEntities) + { + _text = _text.Replace(item.Key, item.Value); + } + } + _documentnode = CreateNode(HtmlNodeType.Document, 0); Parse(); HTMLWEB.cs @@ -472,7 +472,30 @@ } else { - doc.Load(s, true); + //doc.Load(s, true); + if (AutoDetectEncoding&&(DefaultEncoding==null))//attempt to get encoding from web + { + try//attempt to get encoding from response headers + { + var rx = new System.Text.RegularExpressions.Regex("charset=(.+)"); + var encodingStr = 
rx.Match(resp.Headers["Content-Type"]).Groups[1].Value; + DefaultEncoding = System.Text.Encoding.GetEncoding(encodingStr); + } + catch { } + if (DefaultEncoding == null)//if previous fail,attempt to get encoding from html content + { + var _detReq = WebRequest.Create(uri) as HttpWebRequest; + var _detResp = _detReq.GetResponse(); + using (var _detRespStream = _detResp.GetResponseStream()) + { + DefaultEncoding = doc.DetectEncoding(_detRespStream); + } + _detResp.Close(); + } + if (DefaultEncoding == null)//if previous fail,get utf-8 as default encoding + DefaultEncoding = System.Text.Encoding.Default; + } + doc.Load(s, DefaultEncoding, true); } } } @@ -803,5 +826,18 @@ _usingCache = value; } } + + private System.Text.Encoding _defaultEncoding; + public System.Text.Encoding DefaultEncoding + { + get + { + return _defaultEncoding; + } + set + { + _defaultEncoding = value; + } + } } } HTMLNODE.cs @@ -1552,7 +1552,13 @@ html = ((HtmlTextNode)this).Text; if (_ownerdocument.OptionOutputAsXml) { - outText.Write(HtmlDocument.HtmlEncode(html)); + if (_cdataText)//fix cdata + { + outText.Write(html); + _cdataText = false;//fix cdata + } + else//fix cdata + outText.Write(HtmlDocument.HtmlEncode(html)); } else { @@ -1755,6 +1761,7 @@ sw.Flush(); return sw.ToString(); } + internal static bool _cdataText = false;//fix cdata } } ```