百度文库下载器源代码

aivao 贡献于2013-12-30

作者 雨林木风  创建于2010-12-07 04:12:00   修改者微软用户  修改于2012-09-27 08:13:00字数16090

文档摘要:百度文库下载器源代码及文件结构。Zlib 用于豆丁文档的解压缩,请自行在网上查找开源代码(我也是从网上下载的);由于文件较多,没有贴出来,如有需要,跟帖后我可以提供链接。DOCIN.CS 是支持豆丁网的下载类,由于还不支持 V2 版本,所以暂时不公开。
关键词:

 // 百度文库下载器源代码 文件结构 Zlib用于豆丁文档的解压缩,自己网上找开源代码,我也网上下载的,由于文件较多,所以没有贴出来,如果需要,跟帖我可以给链接,DOCIN.CS是支持豆丁网的下载类,由于还不支持V2版本,所以暂时不公开
// (English) Baidu Wenku downloader source code - file layout.
// Zlib is used to decompress Docin documents; look for the open-source code
// online (the author downloaded it from the web as well). It has many files,
// so it is not pasted here; reply to the thread for a link if needed.
// DOCIN.CS is the download class for docin.com; it does not support the V2
// format yet, so it is not published for now.

// ===== Global.cs =====
using System;
using System.Text;
using System.Xml;
using System.IO;
using zlib;

namespace DocDown
{
    /// <summary>
    /// Constants (regular expressions, URL formats, file paths, encodings,
    /// status messages) and small helpers shared by the downloader classes.
    /// </summary>
    public class Global
    {
        /// <summary>Size in bytes of the copy buffer used by the Decompress overloads.</summary>
        private const int BufferSize = 2000;

        /// <summary>Regular expressions used to validate URLs and pick apart downloaded data.</summary>
        public class _DATA_REGEXS
        {
            public static readonly string URL = @"^((http://www\.docin\.com/p-\d{8}\.html)|(http://wenku\.baidu\.com/view/[0-9a-z]{24}\.html))$";
            public static readonly string DOCIN_DOC_INFO = @"^[\s\S]*playcontent = (\{.*?})[\s\S]*$";
            public static readonly string BAIDU_DOC_INFO = @"^[\s\S]*DOC_INFO=(\{.*?})[\s\S]*$";
            public static readonly string BAIDU_DATA = @"^(\{[\s\S]+?\})[\s\S]*?((CWS|FWS)\t[\s\S]*?)+$";
            public static readonly string BAIDU_DATA_HEADER = @"^(\{[\s\S]+?\})([\s\S]+)$";
            public static readonly string BAIDU_DATA_SWF = @"(CWS|FWS)\t[\s\S]*?(?=(CWS|FWS)\t|$)";
        }

        /// <summary>Format strings for the data URLs ({0} = document id, further slots per format).</summary>
        public class _DATA_URL_FROMAT
        {
            public static readonly string BAIDU_DATA_URL_TXT = @"http://wenku.baidu.com/play/{0}?pn={1}&rn={2}";
            public static readonly string BAIDU_DATA_URL_NOT_TXT = @"http://ai.wenku.baidu.com/play/{0}?pn={1}&rn={2}";
            public static readonly string DOCIN_DATA_URL_ONE = @"http://file1.yimk.com/docin_{0}.docin";
            public static readonly string DOCIN_DATA_URL_Next = @"http://file1.yimk.com/docin_{0}_{1}.docin";
        }

        /// <summary>Path format strings for temporary and output files.</summary>
        public class _FILE_PATH
        {
            public static readonly string TEMP_SMALL_PATH = @"TEMP";
            public static readonly string TEMP_FULL_PATH = @"{0}\TEMP";
            public static readonly string SWF_SMALL_PATH = @"TEMP\{0}.SWF";
            public static readonly string SWF_FULL_PATH = @"{0}\TEMP\{1}.SWF";
            public static readonly string JPG_SMALL_PATH = @"TEMP\{0}.JPG";
            public static readonly string JPG_FULL_PATH = @"{0}\TEMP\{1}.JPG";
            public static readonly string PDF_FULL_PATH = @"{0}\{1}.PDF";
            public static readonly string TXT_FULL_PATH = @"{0}\{1}.TXT";
            public static readonly string DOCIN_PATH = @"TEMP\{0}.DOCIN";
            public static readonly string DOCIN_HEADER = @"TEMP\HEAD.DATA";
            public static readonly string DOCIN_SWF = @"TEMP\SWF.DATA";
        }

        /// <summary>Text encodings used when reading pages and raw data.</summary>
        public class _FILE_ENCODING
        {
            public static readonly Encoding EN = Encoding.GetEncoding("Windows-1252");
            public static readonly Encoding CN = Encoding.GetEncoding("GB2312");
            public static readonly Encoding FTF8 = Encoding.UTF8;
        }

        /// <summary>Pixel size requested for the JPG rendered from each SWF page.</summary>
        public class _JPG_SIZE
        {
            public static readonly int W = 1190;
            public static readonly int H = 1684;
        }

        /// <summary>User-visible status and error messages.</summary>
        public class _WORK_STATUS_INFO
        {
            public static readonly string START_WORK = "正在下载文库资源文件。。。";
            public static readonly string CREATE_SWF = "正在生成SWF文件。。。";
            public static readonly string CONVERT_JPG = "正在转换资源文件。。。";
            public static readonly string CREATE_PDF = "正在生成PDF文件。。。";
            public static readonly string DOWN_ERROR = "下载资源失败,请重新提交任务。";
            public static readonly string SUCCESS = "任务已完成。";
            public static readonly string BUSY = "处理中。。。";
            public static readonly string START = "开始";
            public static readonly string URL_ERROR = "文库地址解析错误,请确认文库地址的正确性。";
            public static readonly string DOCIN_DOWN = "正在下载第 {0} 个资源";
            public static readonly string DOCIN_DOWN_ERROR = "下载第 {0} 个资源失败,豆丁服务器异常,请重新尝试下载。";
        }

        /// <summary>Identifiers stored in DOC_INFO.DOMAIN to tell the two supported sites apart.</summary>
        public class _WEB_DOMAIN
        {
            public static readonly string BAIDU = "BAIDU";
            public static readonly string DOCIN = "DOCIN";
        }

        /// <summary>Document types; the name of TXT is compared against DOC_INFO.DOC_TYPE.</summary>
        public enum DOC_TYPE
        {
            DOC,
            PDF,
            PPT,
            XLS,
            TXT
        }

        /// <summary>
        /// Deletes the temporary working directory and everything in it, if it exists.
        /// </summary>
        public static void Clear()
        {
            // Use the same constant the directory is created with, so creation
            // and deletion cannot drift apart (the literal used to be "temp").
            if (Directory.Exists(_FILE_PATH.TEMP_SMALL_PATH))
            {
                Directory.Delete(_FILE_PATH.TEMP_SMALL_PATH, true);
            }
        }

        /// <summary>
        /// Pipes the contents of <paramref name="inFile"/> through a
        /// <c>ZOutputStream</c> into <paramref name="outFile"/>.
        /// </summary>
        /// <param name="inFile">Path of the file to read.</param>
        /// <param name="outFile">Path of the file to create (overwritten if present).</param>
        public static void Decompress(string inFile, string outFile)
        {
            // Open the input first: if it is missing we fail before creating
            // an empty output file, and "using" releases both handles on any error.
            using (FileStream inFileStream = new FileStream(inFile, FileMode.Open))
            using (FileStream outFileStream = new FileStream(outFile, FileMode.Create))
            {
                ZOutputStream outZStream = new ZOutputStream(outFileStream);
                try
                {
                    byte[] buffer = new byte[BufferSize];
                    int len;
                    while ((len = inFileStream.Read(buffer, 0, buffer.Length)) > 0)
                    {
                        outZStream.Write(buffer, 0, len);
                    }
                    outZStream.Flush();
                }
                finally
                {
                    outZStream.Close();
                }
            }
        }

        /// <summary>
        /// Pipes up to <paramref name="length"/> bytes from
        /// <paramref name="inStream"/> through a <c>ZOutputStream</c> into
        /// <paramref name="outFile"/>.
        /// </summary>
        /// <param name="inStream">Source stream, read from its current position; not closed here.</param>
        /// <param name="length">Number of bytes to consume from <paramref name="inStream"/>.</param>
        /// <param name="outFile">Path of the file to create (overwritten if present).</param>
        /// <remarks>
        /// If the source ends before <paramref name="length"/> bytes were read,
        /// copying stops at that point; only bytes actually read are written.
        /// </remarks>
        public static void Decompress(Stream inStream, long length, string outFile)
        {
            using (FileStream outFileStream = new FileStream(outFile, FileMode.Create))
            {
                ZOutputStream outZStream = new ZOutputStream(outFileStream);
                try
                {
                    byte[] buffer = new byte[BufferSize];
                    while (length > 0)
                    {
                        int wanted = (int)Math.Min((long)buffer.Length, length);

                        // Stream.Read may return fewer bytes than requested;
                        // forward only what was really read.
                        int read = inStream.Read(buffer, 0, wanted);
                        if (read <= 0)
                        {
                            break; // source ended early
                        }

                        outZStream.Write(buffer, 0, read);
                        length -= read;
                    }
                    outZStream.Flush();
                }
                finally
                {
                    outZStream.Close();
                }
            }
        }
    }
}

// ===== Iwork.cs =====
using System;
using System.Collections.Generic;
using System.Text;

namespace DocDown
{
    /// <summary>A unit of work that can be started via <see cref="Run"/>.</summary>
    interface Iwork
    {
        void Run();
    }
}

// ===== Work.cs =====
using System;
using System.Text;
using SWFToImage;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;

namespace DocDown
{
    /// <summary>
    /// Base class for site-specific download jobs. Provides the shared
    /// SWF-to-JPG and JPG-to-PDF steps and error reporting to the main form.
    /// </summary>
    public abstract class Work : Iwork
    {
        /// <summary>Form that receives progress and status updates.</summary>
        protected frmMain _frmMain;

        /// <summary>Set to true by <see cref="DownError(int)"/>; subclasses check it to abort.</summary>
        protected bool flag = false;

        /// <summary>Number of pages; pages are numbered 1..totalPage in the temp directory.</summary>
        protected int totalPage;

        public Work(frmMain _frmMain)
        {
            this._frmMain = _frmMain;
        }

        /// <summary>
        /// Renders each page's SWF file in the temp directory to a JPG of
        /// Global._JPG_SIZE, updating the progress bar after every page.
        /// </summary>
        protected void GetJPG()
        {
            _frmMain.ChangelblMsgText(Global._WORK_STATUS_INFO.CONVERT_JPG);
            _frmMain.ChangepbarMainState(0);

            SWFToImageObject obj = new SWFToImageObject();
            obj.ImageWidth = Global._JPG_SIZE.W;
            obj.ImageHeight = Global._JPG_SIZE.H;
            obj.ImageOutputType = SWFToImage.TImageOutputType.iotJPG;

            for (int i = 1; i <= totalPage; i++)
            {
                // Input uses an absolute path under the application base
                // directory; output uses the relative TEMP path.
                obj.InputSWFFileName = string.Format(Global._FILE_PATH.SWF_FULL_PATH, AppDomain.CurrentDomain.BaseDirectory, i);
                obj.Execute();
                obj.SaveToFile(string.Format(Global._FILE_PATH.JPG_SMALL_PATH, i));
                _frmMain.ChangepbarMainState(100 * i / totalPage);
            }
        }

        /// <summary>
        /// Builds the output PDF from the page JPGs, one image per page, and
        /// writes it to PDF_SAVE_DIR\DOC_NAME.PDF (overwriting an existing file).
        /// </summary>
        protected void GetPDF()
        {
            _frmMain.ChangelblMsgText(Global._WORK_STATUS_INFO.CREATE_PDF);

            string pdfPath = string.Format(Global._FILE_PATH.PDF_FULL_PATH, _frmMain._DOC_INFO.PDF_SAVE_DIR, _frmMain._DOC_INFO.DOC_NAME);

            // "using" guarantees the output handle is released even when
            // adding an image throws part-way through.
            using (FileStream pdfStream = new FileStream(pdfPath, FileMode.Create))
            {
                Document doc = new Document(PageSize.A2, 30, 30, 5, 5);
                PdfWriter.GetInstance(doc, pdfStream);
                doc.Open();

                for (int i = 1; i <= totalPage; i++)
                {
                    // JPGs are binary: open them as a plain read-only stream.
                    using (FileStream jpgStream = System.IO.File.OpenRead(string.Format(Global._FILE_PATH.JPG_SMALL_PATH, i)))
                    {
                        doc.Add(iTextSharp.text.Image.GetInstance(jpgStream));
                    }
                }

                doc.Close();
            }
        }

        public abstract void Run();

        /// <summary>Reports a generic download failure; see <see cref="DownError(int)"/>.</summary>
        protected void DownError()
        {
            DownError(0);
        }

        /// <summary>
        /// Shows a failure message, re-enables the submit button, sets
        /// <see cref="flag"/> and removes the temp directory.
        /// </summary>
        /// <param name="page">0 for the generic message; otherwise the resource number put into the Docin message.</param>
        protected void DownError(int page)
        {
            if (0 == page)
            {
                _frmMain.ChangelblMsgText(Global._WORK_STATUS_INFO.DOWN_ERROR);
            }
            else
            {
                _frmMain.ChangelblMsgText(string.Format(Global._WORK_STATUS_INFO.DOCIN_DOWN_ERROR, page));
            }

            _frmMain.ChangeBtnSubmitText(true);
            flag = true;
            Global.Clear();
        }
    }
}

// ===== UrlRequest.cs =====
using System;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace DocDown
{
    /// <summary>HTTP GET helpers and regex-based extraction of values from JSON-like text.</summary>
    public class UrlRequest
    {
        /// <summary>Total number of attempts made by <see cref="Send"/> (one initial try plus five retries).</summary>
        private const int MaxAttempts = 6;

        /// <summary>
        /// Issues an HTTP GET for <paramref name="url"/>, retrying on failure.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>The response, or null if every attempt failed. The caller owns (and must close) the response.</returns>
        public static HttpWebResponse Send(string url)
        {
            for (int attempt = 0; attempt < MaxAttempts; attempt++)
            {
                try
                {
                    HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(url);
                    hwr.Method = "GET";
                    hwr.KeepAlive = false;
                    return (HttpWebResponse)hwr.GetResponse();
                }
                catch (WebException)
                {
                    // Network/protocol failure: fall through and try again.
                }
                catch (Exception)
                {
                    // Anything else (e.g. a malformed URL) will not be cured by
                    // retrying; keep the "null means failure" contract.
                    return null;
                }
            }

            return null;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and decodes the body with <paramref name="enc"/>.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="enc">Encoding used to decode the response body.</param>
        /// <returns>The decoded body, or an empty string when the request failed.</returns>
        public static string GetText(string url, Encoding enc)
        {
            // Dispose response, stream and reader in all cases; the response in
            // particular holds the underlying connection until it is closed.
            using (WebResponse response = Send(url))
            {
                if (null == response)
                {
                    return string.Empty;
                }

                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, enc))
                {
                    return reader.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Extracts the double-quoted value that follows <c>key:</c> inside a brace-delimited object.
        /// </summary>
        /// <param name="value">Text containing the object.</param>
        /// <param name="key">Key exactly as it appears in the text (include the quotes if the text has them); inserted into the pattern unescaped.</param>
        /// <returns>The text with the matched object replaced by the captured value; the input unchanged when nothing matches.</returns>
        public static string GetJson(string value, string key)
        {
            return Regex.Replace(value, string.Format("{{.*?{0}:\"(.*?)\".*}}", key), "$1");
        }

        /// <summary>
        /// Same as <see cref="GetJson"/>, but captures a value enclosed in square brackets.
        /// </summary>
        /// <param name="value">Text containing the object.</param>
        /// <param name="key">Key exactly as it appears in the text; inserted into the pattern unescaped.</param>
        /// <returns>The text with the matched object replaced by the bracket contents; the input unchanged when nothing matches.</returns>
        public static string GetJsons(string value, string key)
        {
            return Regex.Replace(value, string.Format("{{.*?{0}:\\[(.*?)\\].*}}", key), "$1");
        }
    }
}

// ===== DOC_INFO.cs =====
using System;
using System.Text;

namespace DocDown
{
    /// <summary>Plain data holder describing the document being downloaded.</summary>
    public class DOC_INFO
    {
        // Raw info object captured from the document page.
        public string _DOC_INFO;
        public string DOC_ID;
        // Used as the output file name (without extension).
        public string DOC_NAME;
        public string DOC_TYPE;
        public int total_page;
        public int CON_SUC;
        public int PAGE_NUMBER;
        // Format string for the data URL ({0} id, {1} first page, {2} RN).
        public string DATA_URL;
        // Regex applied to each downloaded batch; group 1 is its header object.
        public string DATA_REGEX;
        // Regex that yields one match per page inside a batch.
        public string DATA_PAGE_REGEX;
        // Directory chosen by the user for the output file.
        public string PDF_SAVE_DIR;
        // One of the Global._WEB_DOMAIN values.
        public string DOMAIN;
        // Value sent as the "rn" query parameter of the data URL.
        public string RN;
    }
}

// ===== frmMain.cs =====
using System;
using System.Text;
using System.Windows.Forms;
using System.Text.RegularExpressions;
using System.IO;
using System.Threading;

namespace DocDown
{
    /// <summary>Main window: takes a document URL and runs the download on a background thread.</summary>
    public partial class frmMain : Form
    {
        #region _DOC
        public DOC_INFO _DOC_INFO;
        #endregion

        #region Other
        private Thread _GetPDF = null;
        #endregion

        #region delegate
        delegate void btnSubmitCallback(bool value);
        delegate void pbarMainCallback(int value);
        delegate void lblMsgCallback(string value);

        /// <summary>
        /// Enables/disables the submit button and sets its caption; safe to call from any thread.
        /// </summary>
        /// <param name="value">true = idle ("start" caption), false = busy.</param>
        public void ChangeBtnSubmitText(bool value)
        {
            if (this.btnSubmit.InvokeRequired)
            {
                this.btnSubmit.Invoke(new btnSubmitCallback(ChangeBtnSubmitText), new object[] { value });
            }
            else
            {
                this.btnSubmit.Enabled = value;
                this.btnSubmit.Text = value ? Global._WORK_STATUS_INFO.START : Global._WORK_STATUS_INFO.BUSY;
            }
        }

        /// <summary>
        /// Sets the progress bar value, clamped to 0..100; safe to call from any thread.
        /// </summary>
        public void ChangepbarMainState(int value)
        {
            if (this.pbarMain.InvokeRequired)
            {
                this.pbarMain.Invoke(new pbarMainCallback(ChangepbarMainState), new object[] { value });
            }
            else
            {
                // ProgressBar.Value throws when outside its range, so clamp both ends.
                this.pbarMain.Value = Math.Max(0, Math.Min(100, value));
            }
        }

        /// <summary>Sets the status label text; safe to call from any thread.</summary>
        public void ChangelblMsgText(string value)
        {
            if (this.lblMsg.InvokeRequired)
            {
                this.lblMsg.Invoke(new lblMsgCallback(ChangelblMsgText), new object[] { value });
            }
            else
            {
                this.lblMsg.Text = value;
            }
        }
        #endregion

        public frmMain()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Replaces characters that are not allowed in file names with '_' so
        /// that a document title can be used as the output file name.
        /// </summary>
        private static string SanitizeFileName(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return name;
            }

            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }

            return name;
        }

        /// <summary>
        /// Validates the URL, reads the document info from the page, asks for
        /// a save directory and starts the matching download job on a
        /// background thread.
        /// </summary>
        private void btnSubmit_Click(object sender, EventArgs e)
        {
            string url = txtUrl.Text.Trim();

            #region URL validation
            if (!Regex.IsMatch(url, Global._DATA_REGEXS.URL, RegexOptions.IgnoreCase))
            {
                MessageBox.Show(Global._WORK_STATUS_INFO.URL_ERROR);
                return;
            }

            DOC_INFO info = new DOC_INFO();

            // Try the Baidu page layout first (GB2312), then Docin (UTF-8).
            Match mm = Regex.Match(UrlRequest.GetText(url, Global._FILE_ENCODING.CN), Global._DATA_REGEXS.BAIDU_DOC_INFO);
            if (mm.Success)
            {
                info.DOMAIN = Global._WEB_DOMAIN.BAIDU;
            }
            else
            {
                mm = Regex.Match(UrlRequest.GetText(url, Encoding.UTF8), Global._DATA_REGEXS.DOCIN_DOC_INFO);
                if (!mm.Success)
                {
                    MessageBox.Show(Global._WORK_STATUS_INFO.URL_ERROR);
                    return;
                }
                info.DOMAIN = Global._WEB_DOMAIN.DOCIN;
            }
            #endregion

            #region Document info
            info._DOC_INFO = mm.Groups[1].Value;

            if (Global._WEB_DOMAIN.BAIDU == info.DOMAIN)
            {
                info.DOC_ID = UrlRequest.GetJson(info._DOC_INFO, "doc_id");
                info.DOC_NAME = SanitizeFileName(UrlRequest.GetJson(info._DOC_INFO, "doc_name"));
                info.DOC_TYPE = UrlRequest.GetJson(info._DOC_INFO, "doc_type");

                if (string.Equals(Global.DOC_TYPE.TXT.ToString(), info.DOC_TYPE, StringComparison.OrdinalIgnoreCase))
                {
                    info.RN = "50";
                    info.DATA_URL = Global._DATA_URL_FROMAT.BAIDU_DATA_URL_TXT;
                    info.DATA_REGEX = Global._DATA_REGEXS.BAIDU_DATA_HEADER;
                }
                else
                {
                    info.DATA_URL = Global._DATA_URL_FROMAT.BAIDU_DATA_URL_NOT_TXT;
                    info.DATA_REGEX = Global._DATA_REGEXS.BAIDU_DATA;
                    info.DATA_PAGE_REGEX = Global._DATA_REGEXS.BAIDU_DATA_SWF;
                    info.RN = "5";
                }
            }
            else
            {
                int pageNumber;
                int conSuc;

                // The values come from scraped page text; a layout change must
                // produce an error message, not an unhandled FormatException.
                if (!int.TryParse(UrlRequest.GetJsons(info._DOC_INFO, "\"pageNum\""), out pageNumber)
                    || !int.TryParse(UrlRequest.GetJsons(info._DOC_INFO, "\"converSuccess\""), out conSuc))
                {
                    MessageBox.Show(Global._WORK_STATUS_INFO.URL_ERROR);
                    return;
                }

                info.DOC_ID = Regex.Replace(url.ToLower(), @"http://www\.docin\.com/p-(\d+?)\.html", "$1");
                info.PAGE_NUMBER = pageNumber;
                info.DOC_NAME = SanitizeFileName(UrlRequest.GetJsons(info._DOC_INFO, "\"pdtTitle\"").Trim('"'));
                info.DOC_TYPE = UrlRequest.GetJsons(info._DOC_INFO, "\"pdtType\"").Trim('"');
                info.CON_SUC = conSuc;
            }
            #endregion

            #region Save directory
            // Honour "Cancel": without a directory there is nowhere to write
            // the result, so abort before touching the UI state.
            if (fbdSaveDir.ShowDialog() != DialogResult.OK)
            {
                return;
            }
            info.PDF_SAVE_DIR = fbdSaveDir.SelectedPath;
            #endregion

            #region Initialization
            // Reset the UI controls.
            lblMsg.Text = string.Empty;
            pbarMain.Value = 0;
            btnSubmit.Enabled = false;
            btnSubmit.Text = Global._WORK_STATUS_INFO.BUSY;

            // Publish the document info before the job starts reading it.
            _DOC_INFO = info;

            Iwork work;
            if (Global._WEB_DOMAIN.BAIDU == info.DOMAIN)
            {
                work = new BAIDU(this);
            }
            else
            {
                work = new DOCIN(this);
            }

            // Make sure the temporary working directory exists.
            if (!Directory.Exists(Global._FILE_PATH.TEMP_SMALL_PATH))
            {
                Directory.CreateDirectory(Global._FILE_PATH.TEMP_SMALL_PATH);
            }
            #endregion

            #region Start worker thread
            _GetPDF = new Thread(new ThreadStart(work.Run));
            _GetPDF.IsBackground = true;
            _GetPDF.Start();
            #endregion
        }

        /// <summary>Removes the temp directory when the window closes.</summary>
        private void frmMain_FormClosing(object sender, FormClosingEventArgs e)
        {
            try
            {
                Global.Clear();
            }
            catch (IOException)
            {
                // Best effort: the background job may still hold files in the
                // temp directory; failing to delete it must not block closing.
            }
            catch (UnauthorizedAccessException)
            {
                // Same reasoning as above.
            }
        }
    }
}

// ===== BAIDU.cs =====
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using SWFToImage;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
namespace DocDown
{
    /// <summary>
    /// Download job for wenku.baidu.com documents. TXT documents are saved as
    /// text; every other type is fetched as per-page SWF data, rendered to
    /// JPG and assembled into a PDF by the base class.
    /// </summary>
    public class BAIDU : Work
    {
        public BAIDU(frmMain _frmMain) : base(_frmMain)
        {
        }

        /// <summary>
        /// Runs the whole job. On success the temp directory is removed, the
        /// success message is shown and the submit button is re-enabled; on a
        /// download failure the error has already been reported and nothing
        /// further is done.
        /// </summary>
        public override void Run()
        {
            bool isTxt = string.Equals(_frmMain._DOC_INFO.DOC_TYPE, Global.DOC_TYPE.TXT.ToString(), StringComparison.OrdinalIgnoreCase);

            if (isTxt)
            {
                GetTXT(1);
            }
            else
            {
                GetSWF(1);
            }

            // Applies to both branches: a failed download must not be
            // followed by the "task completed" message.
            if (flag)
            {
                return;
            }

            if (!isTxt)
            {
                GetJPG();
                GetPDF();
            }

            Global.Clear();
            _frmMain.ChangelblMsgText(Global._WORK_STATUS_INFO.SUCCESS);
            _frmMain.ChangeBtnSubmitText(true);
        }

        /// <summary>
        /// Downloads the batch that starts at <paramref name="fromPage"/> and
        /// reads its header object (regex group 1 of DATA_REGEX).
        /// </summary>
        /// <param name="fromPage">First page of the batch (1-based).</param>
        /// <param name="value">Raw batch text.</param>
        /// <param name="match">Match of DATA_REGEX against <paramref name="value"/>.</param>
        /// <param name="toPage">Last page contained in the batch, as reported by the header.</param>
        /// <returns>
        /// false when the download failed, the text has an unexpected shape,
        /// the numbers cannot be parsed, or the reported range is unusable
        /// (toPage before fromPage, or a non-positive total page count).
        /// </returns>
        private bool TryLoadBatch(int fromPage, out string value, out Match match, out int toPage)
        {
            match = null;
            toPage = 0;

            value = UrlRequest.GetText(
                string.Format(_frmMain._DOC_INFO.DATA_URL, _frmMain._DOC_INFO.DOC_ID, fromPage, _frmMain._DOC_INFO.RN),
                Global._FILE_ENCODING.EN);
            if (string.IsNullOrEmpty(value))
            {
                return false;
            }

            match = Regex.Match(value, _frmMain._DOC_INFO.DATA_REGEX);
            if (!match.Success)
            {
                return false;
            }

            string header = match.Groups[1].Value;
            if (!int.TryParse(UrlRequest.GetJson(header, "\"toPage\""), out toPage))
            {
                return false;
            }

            // The total is only taken from the first batch.
            if (1 == fromPage && !int.TryParse(UrlRequest.GetJson(header, "\"totalPage\""), out totalPage))
            {
                return false;
            }

            // toPage < fromPage would make the caller request the same range
            // forever; totalPage <= 0 would divide by zero in the progress math.
            return toPage >= fromPage && totalPage > 0;
        }

        /// <summary>
        /// Downloads all pages from <paramref name="fromPage"/> to the last
        /// page, batch by batch, writing each page to TEMP\{page}.SWF.
        /// Calls DownError() and returns on the first failure.
        /// </summary>
        private void GetSWF(int fromPage)
        {
            if (fromPage == 1)
            {
                _frmMain.ChangelblMsgText(Global._WORK_STATUS_INFO.START_WORK);
            }

            while (true)
            {
                string value;
                Match match;
                int toPage;
                if (!TryLoadBatch(fromPage, out value, out match, out toPage))
                {
                    DownError();
                    return;
                }

                MatchCollection pages = Regex.Matches(value, _frmMain._DOC_INFO.DATA_PAGE_REGEX);
                if (pages.Count < toPage - fromPage + 1)
                {
                    // Fewer page chunks than the header promised.
                    DownError();
                    return;
                }

                for (int i = fromPage, j = 0; i <= toPage; i++, j++)
                {
                    System.IO.File.WriteAllText(string.Format(Global._FILE_PATH.SWF_SMALL_PATH, i), pages[j].Value, Global._FILE_ENCODING.EN);
                    _frmMain.ChangepbarMainState(100 * i / totalPage);
                }

                if (toPage >= totalPage)
                {
                    return;
                }

                fromPage = toPage + 1;
            }
        }

        /// <summary>
        /// Downloads the text of all pages from <paramref name="fromPage"/> to
        /// the last page, batch by batch, into PDF_SAVE_DIR\DOC_NAME.TXT.
        /// Calls DownError() and returns on the first failure.
        /// </summary>
        private void GetTXT(int fromPage)
        {
            if (fromPage == 1)
            {
                _frmMain.ChangelblMsgText(Global._WORK_STATUS_INFO.START_WORK);
            }

            string txtPath = string.Format(Global._FILE_PATH.TXT_FULL_PATH, _frmMain._DOC_INFO.PDF_SAVE_DIR, _frmMain._DOC_INFO.DOC_NAME);

            // When starting from page 1 the first batch replaces any file left
            // by an earlier download instead of being appended to it.
            bool overwrite = (fromPage == 1);

            while (true)
            {
                string value;
                Match match;
                int toPage;
                if (!TryLoadBatch(fromPage, out value, out match, out toPage))
                {
                    DownError();
                    return;
                }

                if (overwrite)
                {
                    System.IO.File.WriteAllText(txtPath, match.Groups[2].Value, Global._FILE_ENCODING.EN);
                    overwrite = false;
                }
                else
                {
                    System.IO.File.AppendAllText(txtPath, match.Groups[2].Value, Global._FILE_ENCODING.EN);
                }

                _frmMain.ChangepbarMainState(100 * toPage / totalPage);

                if (toPage >= totalPage)
                {
                    return;
                }

                fromPage = toPage + 1;
            }
        }
    }
}

// ===== frmMain.Designer.cs =====
namespace DocDown
{
    partial class frmMain
    {
        /// <summary>
        /// Required designer variable.
        /// </summary>
        private System.ComponentModel.IContainer components = null;

        /// <summary>
        /// Clean up any resources being used.
        /// </summary>
        /// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposing && (components != null))
            {
                components.Dispose();
            }
            base.Dispose(disposing);
        }

        #region Windows Form Designer generated code

        /// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.gboxUrl = new System.Windows.Forms.GroupBox();
            this.txtUrl = new System.Windows.Forms.TextBox();
            this.btnSubmit = new System.Windows.Forms.Button();
            this.pbarMain = new System.Windows.Forms.ProgressBar();
            this.lblMsg = new System.Windows.Forms.Label();
            this.fbdSaveDir = new System.Windows.Forms.FolderBrowserDialog();
            this.gboxUrl.SuspendLayout();
            this.SuspendLayout();
            //
            // gboxUrl
            //
            this.gboxUrl.Controls.Add(this.txtUrl);
            this.gboxUrl.Location = new System.Drawing.Point(1, 5);
            this.gboxUrl.Name = "gboxUrl";
            this.gboxUrl.Padding = new System.Windows.Forms.Padding(4, 3, 4, 3);
            this.gboxUrl.Size = new System.Drawing.Size(400, 47);
            this.gboxUrl.TabIndex = 2;
            this.gboxUrl.TabStop = false;
            this.gboxUrl.Text = "文库地址";
            //
            // txtUrl
            //
            this.txtUrl.Dock = System.Windows.Forms.DockStyle.Top;
            this.txtUrl.Location = new System.Drawing.Point(4, 17);
            this.txtUrl.Name = "txtUrl";
            this.txtUrl.Size = new System.Drawing.Size(392, 21);
            this.txtUrl.TabIndex = 1;
            //
            // btnSubmit
            //
            this.btnSubmit.Location = new System.Drawing.Point(407, 11);
            this.btnSubmit.Name = "btnSubmit";
            this.btnSubmit.Size = new System.Drawing.Size(98, 41);
            this.btnSubmit.TabIndex = 0;
            this.btnSubmit.Text = "开始";
            this.btnSubmit.Click += new System.EventHandler(this.btnSubmit_Click);
            //
            // pbarMain
            //
            this.pbarMain.Dock = System.Windows.Forms.DockStyle.Bottom;
            this.pbarMain.Location = new System.Drawing.Point(0, 71);
            this.pbarMain.Name = "pbarMain";
            this.pbarMain.Size = new System.Drawing.Size(506, 20);
            this.pbarMain.TabIndex = 1;
            //
            // lblMsg
            //
            this.lblMsg.AutoSize = true;
            this.lblMsg.Location = new System.Drawing.Point(10, 56);
            this.lblMsg.Name = "lblMsg";
            this.lblMsg.Size = new System.Drawing.Size(0, 12);
            //
            // fbdSaveDir
            //
            this.fbdSaveDir.Description = "PDF文件保存路径";
            //
            // frmMain
            //
            this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 12F);
            this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
            this.ClientSize = new System.Drawing.Size(506, 91);
            this.Controls.Add(this.lblMsg);
            this.Controls.Add(this.pbarMain);
            this.Controls.Add(this.btnSubmit);
            this.Controls.Add(this.gboxUrl);
            this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedSingle;
            this.MaximizeBox = false;
            this.Name = "frmMain";
            this.Text = "百度文库下载器";
            this.FormClosing += new System.Windows.Forms.FormClosingEventHandler(this.frmMain_FormClosing);
            this.gboxUrl.ResumeLayout(false);
            this.gboxUrl.PerformLayout();
            this.ResumeLayout(false);
            this.PerformLayout();
        }

        #endregion

        private System.Windows.Forms.GroupBox gboxUrl;
        private System.Windows.Forms.TextBox txtUrl;
        private System.Windows.Forms.Button btnSubmit;
        private System.Windows.Forms.ProgressBar pbarMain;
        private System.Windows.Forms.Label lblMsg;
        private System.Windows.Forms.FolderBrowserDialog fbdSaveDir;
    }
}

下载文档到电脑,查找使用更方便

文档的实际排版效果,会与网站的显示效果略有不同!!

需要 3 金币 [ 分享文档获得金币 ] 0 人已下载

下载文档