提取网页中的超链接C#代码

12年前
using System;
using System.Xml;
using System.Text;
using System.Net;
using System.IO;
using System.Collections;
using System.Text.RegularExpressions;

/// <summary>
/// Console utility that downloads a web page, extracts the distinct
/// <c>http://</c> hyperlinks found in its HTML, and writes them to
/// <c>HyperLinks.xml</c> in the current directory, one element per link named
/// after the link's domain suffix.
/// </summary>
public class App
{
    // Absolute http:// URL: one or more dot-terminated host labels, a final
    // label, and an optional path/query part. The hyphen inside the last
    // character class is escaped so it can never be parsed as a range.
    private static readonly Regex HyperLinkRegex = new Regex(
        @"http://([\w-]+\.)+[\w-]+(/[\w\- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    // One of the recognised domain suffixes, immediately followed by '/' or
    // by the end of the URL (so "http://a.com" is classified like
    // "http://a.com/").
    private static readonly Regex DomainSuffixRegex = new Regex(
        @"\.(com|net|cn|org|gov)(?=/|$)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Entry point: prompts for a page address, downloads the page, extracts
    /// its hyperlinks and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string strURL = Console.ReadLine();

        // ReadLine returns null at end of input; also reject blank input
        // instead of failing later while building the Uri.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("网页地址不能为空。");
            return;
        }

        strURL = strURL.Trim();
        if (!HasHttpScheme(strURL))
        {
            strURL = @"http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    // Returns true when the address already starts with http:// or https://
    // (compared case-insensitively), so no scheme needs to be prepended.
    private static bool HasHttpScheme(string address)
    {
        return address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute address of the page to fetch.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    private static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // The request must be configured before GetResponse() sends it.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response and reader even if reading fails part-way.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(
            hwRes.GetResponseStream(),
            System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct <c>http://</c> URLs that occur in a block of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML text to scan; null is treated as empty.</param>
    /// <returns>
    /// A sorted <see cref="ArrayList"/> of unique URL strings (empty when
    /// nothing matches).
    /// </returns>
    public static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (htmlCode == null)
        {
            return al;
        }

        // Hash lookup filters duplicate URLs without rescanning the list.
        Hashtable seen = new Hashtable();
        foreach (Match match in HyperLinkRegex.Matches(htmlCode))
        {
            string strNew = match.Value;
            if (!seen.ContainsKey(strNew))
            {
                seen.Add(strNew, null);
                al.Add(strNew);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the extracted links to <c>HyperLinks.xml</c>.
    /// </summary>
    /// <param name="strURL">Address of the page the links came from; recorded in an XML comment.</param>
    /// <param name="alHyperLinks">URL strings to write, one element each.</param>
    private static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // Disposing the writer flushes and closes the file even when a write throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + strURL + "的超链接");

            // Two nested HyperLinks elements, kept as in the original output
            // format; the inner one carries the timestamp attribute.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                // Element name is the domain suffix; element text is the URL.
                string title = GetDomain(str);
                writer.WriteElementString(title, null, str);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL without the leading dot.
    /// </summary>
    /// <param name="strURL">URL to classify; null or empty yields "other".</param>
    /// <returns>
    /// The first of com, net, cn, org or gov (in the casing found in the URL)
    /// that is followed by '/' or ends the URL, or "other" when none is found.
    /// </returns>
    public static string GetDomain(string strURL)
    {
        if (strURL == null || strURL.Length == 0)
        {
            return "other";
        }

        Match m = DomainSuffixRegex.Match(strURL);
        if (!m.Success)
        {
            return "other";
        }

        // Group 1 is the suffix without the dot or the trailing slash.
        return m.Groups[1].Value;
    }
}