asp获取远程url源代码并过滤掉所有HTML代码
作者:刚子 日期:2009-06-01
这个标题听起来费劲,我也不知道怎么表达,今天下午搞定的,发出来记录之,超实用的,绝对首发。
将下面的代码保存为 .asp 文件后,在浏览器中访问 http://你的网址/文件名.asp?id=http://www.baidu.com(其中 id 参数就是要抓取的远程地址),即可看到去除 HTML 标签后的纯文本效果。网上那些模拟蜘蛛抓取的程序也是用这种办法实现的,效果完全一样。
程序代码
将下面的代码保存为 .asp 文件后,在浏览器中访问 http://你的网址/文件名.asp?id=http://www.baidu.com(其中 id 参数就是要抓取的远程地址),即可看到去除 HTML 标签后的纯文本效果。网上那些模拟蜘蛛抓取的程序也是用这种办法实现的,效果完全一样。
程序代码<%
' Read the remote address to fetch from the "ID" query-string parameter
' (the page is invoked as page.asp?id=http://...). The value is user-supplied.
ID = Request.QueryString("ID")
' Fetches <url> with a synchronous HTTP GET and returns the response body
' decoded to a string by Bytes2bStr (which decodes it as GB2312).
'
' Parameters:
'   url - absolute address of the resource to download.
' Returns:
'   The decoded response text, or "" when the request did not reach the
'   completed state (readyState 4).
Function gethttppage(url)
    Dim httpReq

    ' Explicit default so every exit path returns a string.
    gethttppage = ""

    ' ServerXMLHTTP is the MSXML HTTP client meant for server-side code;
    ' the original "microsoft.xmlhttp" ProgID is the client-side component.
    Set httpReq = Server.CreateObject("MSXML2.ServerXMLHTTP")
    httpReq.open "GET", url, False   ' False = synchronous: send blocks until done
    httpReq.send

    If httpReq.readyState = 4 Then
        gethttppage = Bytes2bStr(httpReq.responseBody)
    End If

    ' Single exit path: the object is released on success and failure alike.
    Set httpReq = Nothing
End Function
' Decodes a raw byte array (such as an XMLHTTP responseBody) into a string,
' interpreting the bytes as GB2312 text.
'
' Parameters:
'   vin - byte array holding the encoded text.
' Returns:
'   The decoded string, or "" when vin is Empty, Null or has no bytes.
Function Bytes2bStr(vin)
    Dim BytesStream

    Bytes2bStr = ""

    ' Nothing to decode.
    If IsEmpty(vin) Or IsNull(vin) Then Exit Function
    If LenB(vin) = 0 Then Exit Function

    Set BytesStream = Server.CreateObject("ADODB.Stream")

    ' Load the bytes unchanged in binary mode ...
    BytesStream.Type = 1            ' adTypeBinary
    BytesStream.Open
    BytesStream.Write vin

    ' ... then rewind and reinterpret the same bytes as text in the wanted
    ' charset. Type may only be changed while Position is 0.
    BytesStream.Position = 0
    BytesStream.Type = 2            ' adTypeText
    BytesStream.Charset = "GB2312"
    Bytes2bStr = BytesStream.ReadText

    BytesStream.Close
    Set BytesStream = Nothing
End Function
' Coerce the query-string value to a string and drop surrounding whitespace.
url = Trim("" & id)

' Only issue the HTTP request when an address was actually supplied;
' without one there is nothing to download and the page outputs nothing.
str = ""
If Len(url) > 0 Then str = gethttppage(url)
%>
<%
' Strip scripts, styles and tags from the downloaded markup.
gangzi = RemoveHTML("" & str)
' Removes HTML markup from a string and returns the remaining text.
'
' Steps, in order:
'   1. <script>...</script> blocks, including their content
'   2. <style>...</style> blocks, including their content
'   3. HTML comments <!-- ... -->
'   4. inline event-handler attributes (onclick="...", onload='...'), so a
'      ">" inside a handler value cannot end the tag early in step 5
'   5. all remaining tags
'
' Parameters:
'   Textstr - the HTML source; Null/Empty is treated as "".
' Returns:
'   The input with the markup above removed. Character entities such as
'   &nbsp; are left untouched.
Function RemoveHTML(Textstr)
    Dim sStr, regEx

    sStr = "" & Textstr          ' coerce Null/Empty to an empty string

    Set regEx = New RegExp
    regEx.IgnoreCase = True
    regEx.Global = True
    regEx.Multiline = True

    regEx.Pattern = "<script[\s\S]*?</script>"
    sStr = regEx.Replace(sStr, "")

    regEx.Pattern = "<style[\s\S]*?</style>"
    sStr = regEx.Replace(sStr, "")

    ' Comments may contain ">" and would otherwise be only partly stripped.
    regEx.Pattern = "<!--[\s\S]*?-->"
    sStr = regEx.Replace(sStr, "")

    ' "on" followed by the rest of the attribute name, then a quoted value.
    ' The previous "[on]" was a character class (one "o" OR "n"), which made
    ' the pattern swallow visible text from any word starting with o/n up to
    ' the next quoted attribute.
    regEx.Pattern = "\son\w+\s*=\s*([""'])[\s\S]*?\1"
    sStr = regEx.Replace(sStr, "")

    regEx.Pattern = "<(.[^>]*)>"
    sStr = regEx.Replace(sStr, "")

    Set regEx = Nothing
    RemoveHTML = sStr
End Function
' Send the tag-stripped text to the client as the page output.
' NOTE(review): the text is written without Server.HTMLEncode, so any markup
' fragment that survives RemoveHTML reaches the browser as HTML - confirm
' whether that is acceptable for the intended use.
Response.Write "" & gangzi
%>
' Read the remote address to fetch from the "ID" query-string parameter
' (the page is invoked as page.asp?id=http://...). The value is user-supplied.
ID = Request.QueryString("ID")
' Fetches <url> with a synchronous HTTP GET and returns the response body
' decoded to a string by Bytes2bStr (which decodes it as GB2312).
'
' Parameters:
'   url - absolute address of the resource to download.
' Returns:
'   The decoded response text, or "" when the request did not reach the
'   completed state (readyState 4).
Function gethttppage(url)
    Dim httpReq

    ' Explicit default so every exit path returns a string.
    gethttppage = ""

    ' ServerXMLHTTP is the MSXML HTTP client meant for server-side code;
    ' the original "microsoft.xmlhttp" ProgID is the client-side component.
    Set httpReq = Server.CreateObject("MSXML2.ServerXMLHTTP")
    httpReq.open "GET", url, False   ' False = synchronous: send blocks until done
    httpReq.send

    If httpReq.readyState = 4 Then
        gethttppage = Bytes2bStr(httpReq.responseBody)
    End If

    ' Single exit path: the object is released on success and failure alike.
    Set httpReq = Nothing
End Function
' Decodes a raw byte array (such as an XMLHTTP responseBody) into a string,
' interpreting the bytes as GB2312 text.
'
' Parameters:
'   vin - byte array holding the encoded text.
' Returns:
'   The decoded string, or "" when vin is Empty, Null or has no bytes.
Function Bytes2bStr(vin)
    Dim BytesStream

    Bytes2bStr = ""

    ' Nothing to decode.
    If IsEmpty(vin) Or IsNull(vin) Then Exit Function
    If LenB(vin) = 0 Then Exit Function

    Set BytesStream = Server.CreateObject("ADODB.Stream")

    ' Load the bytes unchanged in binary mode ...
    BytesStream.Type = 1            ' adTypeBinary
    BytesStream.Open
    BytesStream.Write vin

    ' ... then rewind and reinterpret the same bytes as text in the wanted
    ' charset. Type may only be changed while Position is 0.
    BytesStream.Position = 0
    BytesStream.Type = 2            ' adTypeText
    BytesStream.Charset = "GB2312"
    Bytes2bStr = BytesStream.ReadText

    BytesStream.Close
    Set BytesStream = Nothing
End Function
' Coerce the query-string value to a string and drop surrounding whitespace.
url = Trim("" & id)

' Only issue the HTTP request when an address was actually supplied;
' without one there is nothing to download and the page outputs nothing.
str = ""
If Len(url) > 0 Then str = gethttppage(url)
%>
<%
' Strip scripts, styles and tags from the downloaded markup.
gangzi = RemoveHTML("" & str)
' Removes HTML markup from a string and returns the remaining text.
'
' Steps, in order:
'   1. <script>...</script> blocks, including their content
'   2. <style>...</style> blocks, including their content
'   3. HTML comments <!-- ... -->
'   4. inline event-handler attributes (onclick="...", onload='...'), so a
'      ">" inside a handler value cannot end the tag early in step 5
'   5. all remaining tags
'
' Parameters:
'   Textstr - the HTML source; Null/Empty is treated as "".
' Returns:
'   The input with the markup above removed. Character entities such as
'   &nbsp; are left untouched.
Function RemoveHTML(Textstr)
    Dim sStr, regEx

    sStr = "" & Textstr          ' coerce Null/Empty to an empty string

    Set regEx = New RegExp
    regEx.IgnoreCase = True
    regEx.Global = True
    regEx.Multiline = True

    regEx.Pattern = "<script[\s\S]*?</script>"
    sStr = regEx.Replace(sStr, "")

    regEx.Pattern = "<style[\s\S]*?</style>"
    sStr = regEx.Replace(sStr, "")

    ' Comments may contain ">" and would otherwise be only partly stripped.
    regEx.Pattern = "<!--[\s\S]*?-->"
    sStr = regEx.Replace(sStr, "")

    ' "on" followed by the rest of the attribute name, then a quoted value.
    ' The previous "[on]" was a character class (one "o" OR "n"), which made
    ' the pattern swallow visible text from any word starting with o/n up to
    ' the next quoted attribute.
    regEx.Pattern = "\son\w+\s*=\s*([""'])[\s\S]*?\1"
    sStr = regEx.Replace(sStr, "")

    regEx.Pattern = "<(.[^>]*)>"
    sStr = regEx.Replace(sStr, "")

    Set regEx = Nothing
    RemoveHTML = sStr
End Function
' Send the tag-stripped text to the client as the page output.
' NOTE(review): the text is written without Server.HTMLEncode, so any markup
' fragment that survives RemoveHTML reaches the browser as HTML - confirm
' whether that is acceptable for the intended use.
Response.Write "" & gangzi
%>
评论: 0 | 引用: 0 | 查看次数: -
发表评论
上一篇
下一篇

文章来自:
Tags: