I implemented a simple web crawler. I keep the URLs that have been fetched in an array named crawled. The complete code is below. It runs, but it does not print the correct links.
import urllib2

def record_user_click(index, keyword, url):
    urls = lookup(index, url)
    if urls:
        for entry in index:
            if entry[0] == url:
                entry[1] = entry[1] + 1

def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def get_page(url):
    try:
        return urllib2.urlopen(url).read()
    except:
        return ""

def union(a, b):
    # merge b into a in place, skipping duplicates
    for e in b:
        if e not in a:
            a.append(e)

def get_next_target(page):
    # note: the expression ('<a href=' or '" href=') evaluates to just
    # '<a href=', so only the first pattern is ever searched
    start_link = page.find('<a href=' or '" href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def crawl_web(seed, max_pages=200):
    tocrawl = [seed]
    crawled = []
    graph = {}
    index = {}
    while tocrawl and len(crawled) < max_pages:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
            #print crawled
    return crawled, index, graph

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    return None

crawled, index, graph = crawl_web('http://en.wikipedia.org/wiki/Information')
print crawled
When I run the program, it prints the links. The last URL in the output is javascript:bgscro(3), which is not a valid URL. How can I fix this?
[..., 'javascript:bgscro(3)']
Best answer
It looks like the crawler found an anchor whose href is actually used to trigger a JavaScript function. If you want to ignore those cases, you could edit your get_all_links function to something like this:
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            # skip javascript: pseudo-links but keep scanning the page
            if not url.startswith("javascript:"):
                links.append(url)
            page = page[endpos:]
        else:
            break
    return links
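For illustration (my own example, not part of the original answer, and assuming get_next_target as defined in the question), a page that mixes a javascript: anchor with a normal one now yields only the real URL:

page = '<a href="javascript:bgscro(3)">next</a> <a href="http://example.com/a">a</a>'
print get_all_links(page)   # prints ['http://example.com/a']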
Or filter your list of links before processing it:
outlinks = filter(lambda x: not x.startswith("javascript:"), outlinks)
You will most likely run into many edge cases like this.
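A more general approach (my own sketch, not from the answer) is to whitelist URL schemes instead of blacklisting javascript:; the helper name is_crawlable is hypothetical:

from urlparse import urlparse  # Python 2; urllib.parse in Python 3

def is_crawlable(url):
    # keep only absolute http/https links; an empty scheme means a
    # relative link, which would have to be resolved against the page URL
    return urlparse(url).scheme in ('http', 'https')

outlinks = filter(is_crawlable, outlinks)

Whitelisting also drops mailto:, ftp:, and other schemes the crawler cannot fetch, at the cost of skipping relative links unless you resolve them first.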
Regarding "python - web crawler URL address failure", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/24127395/