Another patch for manual encoding and doubly-encoded entities

Topics: Developer Forum, User Forum
Aug 3, 2009 at 4:24 AM

Hi everyone, yesterday I ran into a few problems while fixing these two bugs. I have now fixed both of them in a different way. I hope this helps.

UNIT TESTS (if any of them fail, please try using the DNS server 218.85.152.99)

  [TestMethod()]
        public void EncodingTest()
        {
            // Loads three live pages with encoding auto-detection switched on and compares
            // the encoding reported by each loaded document with the expected one.
            // The test needs network access to the URLs below.
            HtmlWeb web = new HtmlWeb();
            web.AutoDetectEncoding = true;

            // Case 1: page whose encoding is expected to come from the response header.
            string headerUrl = "http://www.google.cn";
            Encoding headerExpected = Encoding.GetEncoding("gb2312");
            HtmlDocument headerDoc = web.Load(headerUrl);
            Assert.AreEqual(headerExpected, headerDoc.Encoding);

            // Case 2: page whose encoding is expected to come from the HTML content.
            string contentUrl = "http://bt.popgo.net/";
            Encoding contentExpected = Encoding.GetEncoding("gb2312");
            HtmlDocument contentDoc = web.Load(contentUrl);
            Assert.AreEqual(contentExpected, contentDoc.Encoding);

            // Case 3: page expected to fall back to the default encoding.
            string fallbackUrl = "http://homepage.mac.com/joeobrin/classic/fat.html";
            Encoding fallbackExpected = Encoding.Default;
            HtmlDocument fallbackDoc = web.Load(fallbackUrl);
            Assert.AreEqual(fallbackExpected, fallbackDoc.Encoding);
        }
        /// <summary>
        /// Checks XML output with named-entity replacement switched on: a script node must
        /// keep its literal double quote and must not contain "&amp;nbsp;", and a page that
        /// uses the copyright sign must still contain it after being written out.
        /// </summary>
        /// <remarks>
        /// Needs network access to the two URLs below and a dictionary file at D:\dic.txt.
        /// </remarks>
        [TestMethod()]
        public void HtmlNamedEntitiesTest()
        {
            HtmlWeb target = new HtmlWeb();
            // Switch on replacement of named entities (e.g. "&nbsp;") by the real character.
            HtmlDocument.OptionEncedeHtmlEntities = true;

            // Each dictionary line holds the real character on the left and its named entity
            // on the right, separated by a comma (presumably something like "¡,&iexcl;").
            // Per the original note, the five XML named entities are not listed in the file.
            string[] lines = File.ReadAllLines(@"D:\dic.txt");
            IDictionary<string, string> htmlNamedEntities = new Dictionary<string, string>();
            foreach (string line in lines)
            {
                int separator = line.IndexOf(",");
                if (separator < 0)
                {
                    // Blank or malformed line: Substring(0, -1) would throw, so skip it.
                    continue;
                }

                string key = line.Substring(separator + 1);
                string value = line.Substring(0, separator);
                if (key.Length == 0)
                {
                    // An empty entity name cannot be used as a search string for replacement.
                    continue;
                }

                // The first definition of an entity wins; later duplicates are ignored.
                if (!htmlNamedEntities.ContainsKey(key))
                {
                    htmlNamedEntities.Add(key, value);
                }
            }
            HtmlDocument.htmlSpecNamedEntities = htmlNamedEntities;

            HtmlDocument doc;
            string url;
            target.AutoDetectEncoding = true;

            // Target CDATA contains the char '"' and does not contain &nbsp;
            url = "http://bt.popgo.net/";
            doc = target.Load(url);
            doc.OptionOutputAsXml = true;
            HtmlNode cdata = doc.DocumentNode.SelectSingleNode("*//body/div/script");
            // Fail with a readable message instead of a NullReferenceException when the
            // page layout no longer matches the XPath.
            Assert.IsNotNull(cdata, "No node matched *//body/div/script");
            Assert.IsTrue(cdata.WriteTo().Contains("\""));
            Assert.IsFalse(cdata.InnerText.Contains(@"&nbsp;"));

            // Target page contains unicode ©
            url = "http://tlt.its.psu.edu/suggestions/international/web/codehtml.html";
            doc = target.Load(url);
            doc.OptionOutputAsXml = true;
            string content = doc.DocumentNode.WriteTo();
            Assert.IsTrue(content.Contains("©"));
        }
PATCHES (how can I upload files?):
HtmlDocument.CS
@@ -41,6 +41,9 @@
 		private Crc32 _crc32 = null;
 		private bool _onlyDetectEncoding = false;
 
+        public static System.Collections.Generic.IDictionary<string, string> htmlSpecNamedEntities;//for fix entities 
+        public static bool OptionEncedeHtmlEntities = false;//for fix entities
+
 		// public props
 
 		/// <summary>
@@ -219,8 +222,8 @@
 				throw new ArgumentNullException("html");
 			}
 			// replace & by &amp; but only once!
-			Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
-			return rx.Replace(html, "&amp;").Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
+            Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;)|(apos;))", RegexOptions.IgnoreCase);//Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
+            return rx.Replace(html, "&amp;").Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;").Replace("'","&apos;");//return rx.Replace(html, "&amp;").Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
 		}
 
 		/// <summary>
@@ -581,6 +584,17 @@
 			_declaredencoding = null;
 
 			_text = reader.ReadToEnd();
+
+            if (OptionEncedeHtmlEntities)//for fix entities
+            {
+                if (htmlSpecNamedEntities == null)
+                    throw new ArgumentNullException("HtmlSpecEntitiesMustAssignedBeforeConvertToXml");
+                foreach (var item in htmlSpecNamedEntities)
+                {
+                    _text = _text.Replace(item.Key, item.Value);
+                }
+            }
+
 			_documentnode = CreateNode(HtmlNodeType.Document, 0);
 			Parse();
 

HTMLWEB.cs
@@ -472,7 +472,30 @@
 						}
 						else
 						{
-							doc.Load(s, true);
+							//doc.Load(s, true);
+                            if (AutoDetectEncoding&&(DefaultEncoding==null))//attempt to get encoding from web
+                            {
+                                try//attempt to get encoding from response headers
+                                {
+                                    var rx = new System.Text.RegularExpressions.Regex("charset=(.+)");
+                                    var encodingStr = rx.Match(resp.Headers["Content-Type"]).Groups[1].Value;
+                                    DefaultEncoding = System.Text.Encoding.GetEncoding(encodingStr);
+                                }
+                                catch { }
+                                if (DefaultEncoding == null)//if previous fail,attempt to get encoding from html content
+                                {
+                                    var _detReq = WebRequest.Create(uri) as HttpWebRequest;
+                                    var _detResp = _detReq.GetResponse();
+                                    using (var _detRespStream = _detResp.GetResponseStream())
+                                    {
+                                        DefaultEncoding = doc.DetectEncoding(_detRespStream);
+                                    }
+                                    _detResp.Close();
+                                }
+                                if (DefaultEncoding == null)//if previous fail,get utf-8 as default encoding
+                                    DefaultEncoding = System.Text.Encoding.Default;
+                            }
+                            doc.Load(s, DefaultEncoding, true);
 						}
 					}
 				}
@@ -803,5 +826,18 @@
 				_usingCache = value;
 			}
 		}
+
+        private System.Text.Encoding _defaultEncoding;
+        public System.Text.Encoding DefaultEncoding
+        {
+            get
+            {
+                return _defaultEncoding;
+            }
+            set
+            {
+                _defaultEncoding = value;
+            }
+        }
 	}
 }

HTMLNODE.cs
@@ -1552,7 +1552,13 @@
 					html = ((HtmlTextNode)this).Text;
 					if (_ownerdocument.OptionOutputAsXml)
 					{
-						outText.Write(HtmlDocument.HtmlEncode(html));
+                        if (_cdataText)//fix cdata
+                        {
+                            outText.Write(html);
+                            _cdataText = false;//fix cdata
+                        }
+                        else//fix cdata
+						    outText.Write(HtmlDocument.HtmlEncode(html));
 					}
 					else
 					{
@@ -1755,6 +1761,7 @@
 			sw.Flush();
 			return sw.ToString();
 		}
+        internal static bool _cdataText = false;//fix cdata 
 	}
 
 }