I am trying to make a web crawler in Python. I borrowed this code from Toby Segaran's book Programming Collective Intelligence. Since the code in the book is outdated, I made some necessary changes, but the program still does not behave as expected. Here is my code:
import urllib
from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import bs4
# Create a list of words to ignore
ignorewords=set(['the','of','to','and','a','in','is','it'])
class crawler:
    # Initialize the crawler with the name of database
    def __init__(self,dbname):
        pass

    def __del__(self): pass

    def dbcommit(self):
        pass

    # Auxilliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self,table,field,value,createnew=True):
        return None

    # Index an individual page
    def addtoindex(self,url,soup):
        print('Indexing %s' % url)

    # Extract the text from an HTML page (no tags)
    def gettextonly(self,soup):
        return None

    # Separate the words by any non-whitespace character
    def separatewords(self,text):
        return None

    # Return true if this url is already indexed
    def isindexed(self,url):
        return False

    # Add a link between two pages
    def addlinkref(self,urlFrom,urlTo,linkText):
        pass

    # Starting with a list of pages, do a breadth
    # first search to the given depth, indexing pages
    # as we go
    def crawl(self,pages,depth=2):
        pass

    # Create the database tables
    def createindextables(self):
        pass

    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c=request.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup=BeautifulSoup(c.read())
                self.addtoindex(page,soup)

                links=soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1: continue
                        url=url.split('#')[0]  # remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText=self.gettextonly(link)
                        self.addlinkref(page,url,linkText)
                self.dbcommit()
            pages=newpages

pagelist=['http://google.com']
#pagelist=['file:///C:/Users/admin/Desktop/abcd.html']
crawler=crawler('')
crawler.crawl(pagelist)
The only output I get is

Indexing http://google.com
Indexing http://google.com
Press any key to continue...

Every time I put another link into pagelist, I get the same kind of output, "Indexing xyz", where xyz is whatever link I put in the list. I also tried making an HTML file with lots of <a> tags in it, but that didn't work either.
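For what it is worth, here is the minimal check I would run (a sketch of my own, reusing the same imports and the same soup('a') lookup as the crawler above) to see which links BeautifulSoup actually extracts from the seed page:

from urllib import request
from bs4 import BeautifulSoup

# Fetch the seed page and print every href that the soup('a') lookup finds.
page = 'http://google.com'
html = request.urlopen(page).read()
soup = BeautifulSoup(html, 'html.parser')
for link in soup('a'):
    if 'href' in dict(link.attrs):
        print(link['href'])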
Best answer
The problem is in your line link=soup('a'). If you want to find elements of class "a", you should use a different method named "find_element_by..." (see the bs4 documentation).
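For reference, a minimal sketch of pulling anchor tags out of a document with bs4's documented find_all method; the HTML fragment below is made up purely for illustration:

from bs4 import BeautifulSoup

# A made-up HTML fragment, just to illustrate the lookup call.
html = '<p><a href="http://example.com/one">one</a>' \
       '<a href="http://example.com/two">two</a></p>'

soup = BeautifulSoup(html, 'html.parser')

# find_all returns every matching tag; href=True keeps only tags with an href attribute.
for link in soup.find_all('a', href=True):
    print(link['href'], link.get_text())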
About python - My web crawler does not work with BeautifulSoup: we found a similar question on Stack Overflow: https://stackoverflow.com/questions/44779286/