python 网络爬虫
爬虫基本操作
requests:请求是否成功
积累程序-爬虫-requests:请求是否成功
import requests
response=requests.get("https://www.pku.edu.cn")
print(response.status_code)#用来检查请求是否正确响应,如果状态码是200,代表请求成功。
#状态码分类:1XX,请求已收到;2XX,请求成功;3XX,重定向,如305表示应使用代理访问;4XX,客户端错误,如403表示禁止访问;5XX,服务器错误,如503表示服务器不可用。
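在此基础上,还可以让请求在状态码异常时直接抛出错误,便于及时发现问题。下面是一个简单的补充示例(仍以北大主页为例,属于示意写法):
import requests
response=requests.get("https://www.pku.edu.cn",timeout=10)#设置超时,避免请求一直挂起
if response.status_code==200:
    print("请求成功")
else:
    response.raise_for_status()#状态码为4XX或5XX时抛出HTTPError异常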
将文件写入本地
import requests
webFile=requests.get("https://www.pku.edu.cn/about.html")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现
print(data)
with open(r"E:/myDownload.html","w",encoding="utf-8") as file1:#将requests获得的网络文件写入本地中。
    file1.write(data)
#【舉例】
import requests
a=1#起始卷号
urlList=[]
for i in range(a,a+3):#调试时可以先把范围缩小为range(a,a+1)
    webUrl="https://zh.m.wikisource.org/wiki/春秋左傳正義/卷"+str(i)
    urlList.append(webUrl)
    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"
    data=webFile.text
    myDfile="myDownload"+str(i)+".html"
    #第一種:用with语句写入,自动关闭文件
    with open(myDfile,"w",encoding="utf-8") as file1:#将requests获得的网络文件写入本地
        file1.write(data)
    #第二種:手动open、write、close
    wFile=open(myDfile,"w",encoding="utf-8")
    wFile.write(data)
    wFile.close()
观察网站结构
在爬取过程中,需要观察网站的结构。
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
</body>
</html>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html,'html.parser')
#print(soup)
#print(type(soup))#BeautifulSoup
tag=soup.find('p')
#print(tag)#Tag
string1=tag.string
#print(string1)#NavigableString
soup2=BeautifulSoup("<b><!--Hey--></b>",'html.parser')
comment=soup2.b.string
#print(comment)
#print(type(comment))
soup3=BeautifulSoup('<ad id=123 class="red blue">Hey</ad>','html.parser')
tag=soup3.ad
##print(tag.name)#ad是自己定义的,命名的。
##print(tag.attrs)
##Tag对象类似于HTML文档的标签.
##
##对于标签来说,最重要的就是名字name和属性attrs.
#修改soup的信息
soup=BeautifulSoup('<p id=123 class="red blue">Hey</p>','html.parser')
tag=soup.p
tag.name='a'
tag.attrs['id']=456
tag.attrs['class'][0]='white'
#print(soup)
from bs4 import BeautifulSoup
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
##print(tag)
##string=tag.string
##print(string)
##print(type(string))
##
##print(string.split('e'))
##
##print(string.lower())
#NavigableString同样可以被直接修改,也可以使用replace_with的方法来修改.
from bs4 import BeautifulSoup
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
a='Hello'
tag.string=a
##print(soup)
##tag.string.replace_with('KO')
##print(soup)
html = """
<div>Total
<p class="story"> First_p
<a id="1">El</a>,
<a id="2">E2</a>,
<a id="3">E3</a>,
</p>
<p>Second_p</p>
</div>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html,'html.parser')
#print(soup)
tag=soup.p
#print(tag)
#首先,存在多个同名标签时,用soup.标签名的方式取到的永远是第一个该标签;要获取全部同名标签,应使用find_all.
##
##print(len(tag.contents))
##print(tag.contents)
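下面用一个简短示例验证这一点(示例中的html字符串是为演示另行构造的):
from bs4 import BeautifulSoup
html2="<div><p>First_p</p><p>Second_p</p></div>"
soup=BeautifulSoup(html2,'html.parser')
print(soup.p)#只取到第一个p标签:<p>First_p</p>
print(soup.find_all('p'))#返回所有p标签组成的列表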
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<div>
<p class="a title"><b>The Dormouse's story</b></p>
<p class="a story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
</div>
<div>
<p class="st">Last<p class="st">......</p></p>
</div>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
#标签就是HTML的标签.比如搜索html文档中的所有a标签.
#print(soup.find_all('a'))
#可以接受正则表达式作为过滤,比如所有名称中包含'a'的标签.
print()
import re
#print(soup.find_all(re.compile('a')))
##
##
##列表中所包含的元素都将作为过滤标准,比如搜索所有的a标签和b标签.
#print(soup.find_all(['a','b']))
#print()
#print(soup.find_all('p')[1].find_all(True))
##输出
##[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
## <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
## <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
def filter(tag):
    return tag['id']=='link2'
#print(soup.find_all('p')[1])
##print(soup.find_all('p')[1].find_all(filter))
##
##print(soup.find_all('p')[1].find_all(filter))
#输出:
#[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print(soup.select('.st'))
#HTML解析器会自动补全不完整或不规范的标签,BeautifulSoup再提供prettify()方法,把解析结果按层级缩进输出,便于阅读.
把保存文件的扩展名改成 .htm 或者 .txt,就可以改变文件被打开的方式:.htm 会被浏览器当作网页渲染,.txt 则按纯文本显示。
如何创建htm文件?把HTML源码写入一个以 .htm 结尾的文件即可,用浏览器就能打开,相当于一个本地网页。
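下面是一个最小的示意(文件名myPage.htm为举例,可自行替换):
import os
import webbrowser
html='<html><head><meta charset="utf-8"><title>测试页</title></head><body><p>你好,网页</p></body></html>'
with open("myPage.htm","w",encoding="utf-8") as f:#把HTML源码写入.htm文件
    f.write(html)
webbrowser.open("file://"+os.path.abspath("myPage.htm"))#用默认浏览器打开这个本地网页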
电脑信息字段(请求头headers)
有时候,需要在请求里附带浏览器、操作系统等电脑信息,也就是请求头headers(核心是user-agent),以及查询参数params,否则部分网站会拒绝响应。
import requests
import csv
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
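这两个字典通过requests.get的headers和params参数传入(下面的网址仅作示例,实际使用时换成目标页面):
url="https://www.zhihu.com/follow"
webFile=requests.get(url,params=params,headers=headers)#params会被拼接到URL的查询字符串中
print(webFile.status_code)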
BeautifulSoup的使用
在交互环境中执行help(BeautifulSoup),可以查看它的完整接口说明,输出如下,可作速查:
Help on class BeautifulSoup in module bs4:
class BeautifulSoup(bs4.element.Tag)
| BeautifulSoup(markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
|
| A data structure representing a parsed HTML or XML document.
|
| Most of the methods you'll call on a BeautifulSoup object are inherited from
| PageElement or Tag.
|
| Internally, this class defines the basic interface called by the
| tree builders when converting an HTML/XML document into a data
| structure. The interface abstracts away the differences between
| parsers. To write a new tree builder, you'll need to understand
| these methods as a whole.
|
| These methods will be called by the BeautifulSoup constructor:
| * reset()
| * feed(markup)
|
| The tree builder may call these methods from its feed() implementation:
| * handle_starttag(name, attrs) # See note about return value
| * handle_endtag(name)
| * handle_data(data) # Appends to the current data node
| * endData(containerClass) # Ends the current data node
|
| No matter how complicated the underlying parser is, you should be
| able to build a tree using 'start tag' events, 'end tag' events,
| 'data' events, and "done with data" events.
|
| If you encounter an empty-element tag (aka a self-closing tag,
| like HTML's <br> tag), call handle_starttag and then
| handle_endtag.
|
| Method resolution order:
| BeautifulSoup
| bs4.element.Tag
| bs4.element.PageElement
| builtins.object
|
| Methods defined here:
|
| __copy__(self)
| Copy a BeautifulSoup object by converting the document to a string and parsing it again.
|
| __getstate__(self)
|
| __init__(self, markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
| Constructor.
|
| :param markup: A string or a file-like object representing
| markup to be parsed.
|
| :param features: Desirable features of the parser to be
| used. This may be the name of a specific parser ("lxml",
| "lxml-xml", "html.parser", or "html5lib") or it may be the
| type of markup to be used ("html", "html5", "xml"). It's
| recommended that you name a specific parser, so that
| Beautiful Soup gives you the same results across platforms
| and virtual environments.
|
| :param builder: A TreeBuilder subclass to instantiate (or
| instance to use) instead of looking one up based on
| `features`. You only need to use this if you've implemented a
| custom TreeBuilder.
|
| :param parse_only: A SoupStrainer. Only parts of the document
| matching the SoupStrainer will be considered. This is useful
| when parsing part of a document that would otherwise be too
| large to fit into memory.
|
| :param from_encoding: A string indicating the encoding of the
| document to be parsed. Pass this in if Beautiful Soup is
| guessing wrongly about the document's encoding.
|
| :param exclude_encodings: A list of strings indicating
| encodings known to be wrong. Pass this in if you don't know
| the document's encoding but you know Beautiful Soup's guess is
| wrong.
|
| :param element_classes: A dictionary mapping BeautifulSoup
| classes like Tag and NavigableString, to other classes you'd
| like to be instantiated instead as the parse tree is
| built. This is useful for subclassing Tag or NavigableString
| to modify default behavior.
|
| :param kwargs: For backwards compatibility purposes, the
| constructor accepts certain keyword arguments used in
| Beautiful Soup 3. None of these arguments do anything in
| Beautiful Soup 4; they will result in a warning and then be
| ignored.
|
| Apart from this, any keyword arguments passed into the
| BeautifulSoup constructor are propagated to the TreeBuilder
| constructor. This makes it possible to configure a
| TreeBuilder by passing in arguments, not just by saying which
| one to use.
|
| decode(self, pretty_print=False, eventual_encoding='utf-8', formatter='minimal')
| Returns a string or Unicode representation of the parse tree
| as an HTML or XML document.
|
| :param pretty_print: If this is True, indentation will be used to
| make the document more readable.
| :param eventual_encoding: The encoding of the final document.
| If this is None, the document will be a Unicode string.
|
| endData(self, containerClass=None)
| Method called by the TreeBuilder when the end of a data segment
| occurs.
|
| handle_data(self, data)
| Called by the tree builder when a chunk of textual data is encountered.
|
| handle_endtag(self, name, nsprefix=None)
| Called by the tree builder when an ending tag is encountered.
|
| :param name: Name of the tag.
| :param nsprefix: Namespace prefix for the tag.
|
| handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, sourcepos=None)
| Called by the tree builder when a new tag is encountered.
|
| :param name: Name of the tag.
| :param nsprefix: Namespace prefix for the tag.
| :param attrs: A dictionary of attribute values.
| :param sourceline: The line number where this tag was found in its
| source document.
| :param sourcepos: The character position within `sourceline` where this
| tag was found.
|
| If this method returns None, the tag was rejected by an active
| SoupStrainer. You should proceed as if the tag had not occurred
| in the document. For instance, if this was a self-closing tag,
| don't call handle_endtag.
|
| insert_after(self, successor)
| This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
| it because there is nothing before or after it in the parse tree.
|
| insert_before(self, successor)
| This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
| it because there is nothing before or after it in the parse tree.
|
| new_string(self, s, subclass=None)
| Create a new NavigableString associated with this BeautifulSoup
| object.
|
| new_tag(self, name, namespace=None, nsprefix=None, attrs={}, sourceline=None, sourcepos=None, **kwattrs)
| Create a new Tag associated with this BeautifulSoup object.
|
| object_was_parsed(self, o, parent=None, most_recent_element=None)
| Method called by the TreeBuilder to integrate an object into the parse tree.
|
| popTag(self)
| Internal method called by _popToTag when a tag is closed.
|
| pushTag(self, tag)
| Internal method called by handle_starttag when a tag is opened.
|
| reset(self)
| Reset this object to a state as though it had never parsed any
| markup.
|
| ----------------------------------------------------------------------
| Data and other attributes defined here:
|
| ASCII_SPACES = ' \n\t\x0c\r'
|
| DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
| NO_PARSER_SPECIFIED_WARNING = 'No parser was explicitly specified, so ...
|
| ROOT_TAG_NAME = '[document]'
|
| ----------------------------------------------------------------------
| Methods inherited from bs4.element.Tag:
|
| __bool__(self)
| A tag is non-None even if it has no contents.
|
| __call__(self, *args, **kwargs)
| Calling a Tag like a function is the same as calling its
| find_all() method. Eg. tag('a') returns a list of all the A tags
| found within this tag.
|
| __contains__(self, x)
|
| __delitem__(self, key)
| Deleting tag[key] deletes all 'key' attributes for the tag.
|
| __eq__(self, other)
| Returns true iff this Tag has the same name, the same attributes,
| and the same contents (recursively) as `other`.
|
| __getattr__(self, tag)
| Calling tag.subtag is the same as calling tag.find(name="subtag")
|
| __getitem__(self, key)
| tag[key] returns the value of the 'key' attribute for the Tag,
| and throws an exception if it's not there.
|
| __hash__(self)
| Return hash(self).
|
| __iter__(self)
| Iterating over a Tag iterates over its contents.
|
| __len__(self)
| The length of a Tag is the length of its list of contents.
|
| __ne__(self, other)
| Returns true iff this Tag is not identical to `other`,
| as defined in __eq__.
|
| __repr__ = __unicode__(self)
|
| __setitem__(self, key, value)
| Setting tag[key] sets the value of the 'key' attribute for the
| tag.
|
| __str__ = __unicode__(self)
|
| __unicode__(self)
| Renders this PageElement as a Unicode string.
|
| childGenerator(self)
| Deprecated generator.
|
| clear(self, decompose=False)
| Wipe out all children of this PageElement by calling extract()
| on them.
|
| :param decompose: If this is True, decompose() (a more
| destructive method) will be called instead of extract().
|
| decode_contents(self, indent_level=None, eventual_encoding='utf-8', formatter='minimal')
| Renders the contents of this tag as a Unicode string.
|
| :param indent_level: Each line of the rendering will be
| indented this many spaces. Used internally in
| recursive calls while pretty-printing.
|
| :param eventual_encoding: The tag is destined to be
| encoded into this encoding. decode_contents() is _not_
| responsible for performing that encoding. This information
| is passed in so that it can be substituted in if the
| document contains a <META> tag that mentions the document's
| encoding.
|
| :param formatter: A Formatter object, or a string naming one of
| the standard Formatters.
|
| decompose(self)
| Recursively destroys this PageElement and its children.
|
| This element will be removed from the tree and wiped out; so
| will everything beneath it.
|
| encode(self, encoding='utf-8', indent_level=None, formatter='minimal', errors='xmlcharrefreplace')
| Render a bytestring representation of this PageElement and its
| contents.
|
| :param encoding: The destination encoding.
| :param indent_level: Each line of the rendering will be
| indented this many spaces. Used internally in
| recursive calls while pretty-printing.
| :param formatter: A Formatter object, or a string naming one of
| the standard formatters.
| :param errors: An error handling strategy such as
| 'xmlcharrefreplace'. This value is passed along into
| encode() and its value should be one of the constants
| defined by Python.
| :return: A bytestring.
|
| encode_contents(self, indent_level=None, encoding='utf-8', formatter='minimal')
| Renders the contents of this PageElement as a bytestring.
|
| :param indent_level: Each line of the rendering will be
| indented this many spaces. Used internally in
| recursive calls while pretty-printing.
|
| :param eventual_encoding: The bytestring will be in this encoding.
|
| :param formatter: A Formatter object, or a string naming one of
| the standard Formatters.
|
| :return: A bytestring.
|
| find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
| Look in the children of this PageElement and find the first
| PageElement that matches the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param recursive: If this is True, find() will perform a
| recursive search of this PageElement's children. Otherwise,
| only the direct children will be considered.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| findAll = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
|
| findChild = find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
|
| findChildren = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
|
| find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
| Look in the children of this PageElement and find all
| PageElements that match the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param recursive: If this is True, find_all() will perform a
| recursive search of this PageElement's children. Otherwise,
| only the direct children will be considered.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| get(self, key, default=None)
| Returns the value of the 'key' attribute for the tag, or
| the value given for 'default' if it doesn't have that
| attribute.
|
|
| get_attribute_list(self, key, default=None)
| The same as get(), but always returns a list.
|
| :param key: The attribute to look for.
| :param default: Use this value if the attribute is not present
| on this PageElement.
| :return: A list of values, probably containing only a single
| value.
|
| get_text(self, separator='', strip=False, types=(<class 'bs4.element.NavigableString'>, <class 'bs4.element.CData'>))
| Get all child strings, concatenated using the given separator.
|
| :param separator: Strings will be concatenated using this separator.
|
| :param strip: If True, strings will be stripped before being
| concatenated.
|
| :types: A tuple of NavigableString subclasses. Any strings of
| a subclass not found in this list will be ignored. By
| default, this means only NavigableString and CData objects
| will be considered. So no comments, processing instructions,
| etc.
|
| :return: A string.
|
| has_attr(self, key)
| Does this PageElement have an attribute with the given name?
|
| has_key(self, key)
| Deprecated method. This was kind of misleading because has_key()
| (attributes) was different from __in__ (contents).
|
| has_key() is gone in Python 3, anyway.
|
| index(self, element)
| Find the index of a child by identity, not value.
|
| Avoids issues with tag.contents.index(element) getting the
| index of equal elements.
|
| :param element: Look for this PageElement in `self.contents`.
|
| prettify(self, encoding=None, formatter='minimal')
| Pretty-print this PageElement as a string.
|
| :param encoding: The eventual encoding of the string. If this is None,
| a Unicode string will be returned.
| :param formatter: A Formatter object, or a string naming one of
| the standard formatters.
| :return: A Unicode string (if encoding==None) or a bytestring
| (otherwise).
|
| recursiveChildGenerator(self)
| Deprecated generator.
|
| renderContents(self, encoding='utf-8', prettyPrint=False, indentLevel=0)
| Deprecated method for BS3 compatibility.
|
| select(self, selector, namespaces=None, limit=None, **kwargs)
| Perform a CSS selection operation on the current element.
|
| This uses the SoupSieve library.
|
| :param selector: A string containing a CSS selector.
|
| :param namespaces: A dictionary mapping namespace prefixes
| used in the CSS selector to namespace URIs. By default,
| Beautiful Soup will use the prefixes it encountered while
| parsing the document.
|
| :param limit: After finding this number of results, stop looking.
|
| :param kwargs: Keyword arguments to be passed into SoupSieve's
| soupsieve.select() method.
|
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| select_one(self, selector, namespaces=None, **kwargs)
| Perform a CSS selection operation on the current element.
|
| :param selector: A CSS selector.
|
| :param namespaces: A dictionary mapping namespace prefixes
| used in the CSS selector to namespace URIs. By default,
| Beautiful Soup will use the prefixes it encountered while
| parsing the document.
|
| :param kwargs: Keyword arguments to be passed into SoupSieve's
| soupsieve.select() method.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| smooth(self)
| Smooth out this element's children by consolidating consecutive
| strings.
|
| This makes pretty-printed output look more natural following a
| lot of operations that modified the tree.
|
| ----------------------------------------------------------------------
| Readonly properties inherited from bs4.element.Tag:
|
| children
| Iterate over all direct children of this PageElement.
|
| :yield: A sequence of PageElements.
|
| descendants
| Iterate over all children of this PageElement in a
| breadth-first sequence.
|
| :yield: A sequence of PageElements.
|
| isSelfClosing
| Is this tag an empty-element tag? (aka a self-closing tag)
|
| A tag that has contents is never an empty-element tag.
|
| A tag that has no contents may or may not be an empty-element
| tag. It depends on the builder used to create the tag. If the
| builder has a designated list of empty-element tags, then only
| a tag whose name shows up in that list is considered an
| empty-element tag.
|
| If the builder has no designated list of empty-element tags,
| then any tag with no contents is an empty-element tag.
|
| is_empty_element
| Is this tag an empty-element tag? (aka a self-closing tag)
|
| A tag that has contents is never an empty-element tag.
|
| A tag that has no contents may or may not be an empty-element
| tag. It depends on the builder used to create the tag. If the
| builder has a designated list of empty-element tags, then only
| a tag whose name shows up in that list is considered an
| empty-element tag.
|
| If the builder has no designated list of empty-element tags,
| then any tag with no contents is an empty-element tag.
|
| strings
| Yield all strings of certain classes, possibly stripping them.
|
| :param strip: If True, all strings will be stripped before being
| yielded.
|
| :types: A tuple of NavigableString subclasses. Any strings of
| a subclass not found in this list will be ignored. By
| default, this means only NavigableString and CData objects
| will be considered. So no comments, processing instructions,
| etc.
|
| :yield: A sequence of strings.
|
| stripped_strings
| Yield all strings in the document, stripping them first.
|
| :yield: A sequence of stripped strings.
|
| text
| Get all child strings, concatenated using the given separator.
|
| :param separator: Strings will be concatenated using this separator.
|
| :param strip: If True, strings will be stripped before being
| concatenated.
|
| :types: A tuple of NavigableString subclasses. Any strings of
| a subclass not found in this list will be ignored. By
| default, this means only NavigableString and CData objects
| will be considered. So no comments, processing instructions,
| etc.
|
| :return: A string.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from bs4.element.Tag:
|
| parserClass
|
| string
| Convenience property to get the single string within this
| PageElement.
|
| TODO It might make sense to have NavigableString.string return
| itself.
|
| :return: If this element has a single string child, return
| value is that string. If this element has one child tag,
| return value is the 'string' attribute of the child tag,
| recursively. If this element is itself a string, has no
| children, or has more than one child, return value is None.
|
| ----------------------------------------------------------------------
| Methods inherited from bs4.element.PageElement:
|
| append(self, tag)
| Appends the given PageElement to the contents of this one.
|
| :param tag: A PageElement.
|
| extend(self, tags)
| Appends the given PageElements to this one's contents.
|
| :param tags: A list of PageElements.
|
| extract(self)
| Destructively rips this element out of the tree.
|
| :return: `self`, no longer part of the tree.
|
| fetchNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| fetchParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
|
| fetchPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| fetchPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findAllNext = find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findAllPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findNext = find_next(self, name=None, attrs={}, text=None, **kwargs)
|
| findNextSibling = find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
|
| findNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findParent = find_parent(self, name=None, attrs={}, **kwargs)
|
| findParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
|
| findPrevious = find_previous(self, name=None, attrs={}, text=None, **kwargs)
|
| findPreviousSibling = find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
|
| findPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Find all PageElements that match the given criteria and appear
| later in the document than this PageElement.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet containing PageElements.
|
| find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Look backwards in the document from this PageElement and find all
| PageElements that match the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| find_next(self, name=None, attrs={}, text=None, **kwargs)
| Find the first PageElement that matches the given criteria and
| appears later in the document than this PageElement.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
| Find the closest sibling to this PageElement that matches the
| given criteria and appears later in the document.
|
| All find_* methods take a common set of arguments. See the
| online documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Find all siblings of this PageElement that match the given criteria
| and appear later in the document.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| find_parent(self, name=None, attrs={}, **kwargs)
| Find the closest parent of this PageElement that matches the given
| criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :kwargs: A dictionary of filters on attribute values.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_parents(self, name=None, attrs={}, limit=None, **kwargs)
| Find all parents of this PageElement that match the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_previous(self, name=None, attrs={}, text=None, **kwargs)
| Look backwards in the document from this PageElement and find the
| first PageElement that matches the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
| Returns the closest sibling to this PageElement that matches the
| given criteria and appears earlier in the document.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Returns all siblings to this PageElement that match the
| given criteria and appear earlier in the document.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| format_string(self, s, formatter)
| Format the given string using the given formatter.
|
| :param s: A string.
| :param formatter: A Formatter object, or a string naming one of the standard formatters.
|
| formatter_for_name(self, formatter)
| Look up or create a Formatter for the given identifier,
| if necessary.
|
| :param formatter: Can be a Formatter object (used as-is), a
| function (used as the entity substitution hook for an
| XMLFormatter or HTMLFormatter), or a string (used to look
| up an XMLFormatter or HTMLFormatter in the appropriate
| registry.
|
| insert(self, position, new_child)
| Insert a new PageElement in the list of this PageElement's children.
|
| This works the same way as `list.insert`.
|
| :param position: The numeric position that should be occupied
| in `self.children` by the new PageElement.
| :param new_child: A PageElement.
|
| nextGenerator(self)
| # Old non-property versions of the generators, for backwards
| # compatibility with BS3.
|
| nextSiblingGenerator(self)
|
| parentGenerator(self)
|
| previousGenerator(self)
|
| previousSiblingGenerator(self)
|
| replaceWith = replace_with(self, replace_with)
|
| replaceWithChildren = unwrap(self)
|
| replace_with(self, replace_with)
| Replace this PageElement with another one, keeping the rest of the
| tree the same.
|
| :param replace_with: A PageElement.
| :return: `self`, no longer part of the tree.
|
| replace_with_children = unwrap(self)
|
| setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None)
| Sets up the initial relations between this element and
| other elements.
|
| :param parent: The parent of this element.
|
| :param previous_element: The element parsed immediately before
| this one.
|
| :param next_element: The element parsed immediately before
| this one.
|
| :param previous_sibling: The most recently encountered element
| on the same level of the parse tree as this one.
|
| :param previous_sibling: The next element to be encountered
| on the same level of the parse tree as this one.
|
| unwrap(self)
| Replace this PageElement with its contents.
|
| :return: `self`, no longer part of the tree.
|
| wrap(self, wrap_inside)
| Wrap this PageElement inside another one.
|
| :param wrap_inside: A PageElement.
| :return: `wrap_inside`, occupying the position in the tree that used
| to be occupied by `self`, and with `self` inside it.
|
| ----------------------------------------------------------------------
| Readonly properties inherited from bs4.element.PageElement:
|
| next
| The PageElement, if any, that was parsed just after this one.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| next_elements
| All PageElements that were parsed after this one.
|
| :yield: A sequence of PageElements.
|
| next_siblings
| All PageElements that are siblings of this one but were parsed
| later.
|
| :yield: A sequence of PageElements.
|
| parents
| All PageElements that are parents of this PageElement.
|
| :yield: A sequence of PageElements.
|
| previous
| The PageElement, if any, that was parsed just before this one.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| previous_elements
| All PageElements that were parsed before this one.
|
| :yield: A sequence of PageElements.
|
| previous_siblings
| All PageElements that are siblings of this one but were parsed
| earlier.
|
| :yield: A sequence of PageElements.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from bs4.element.PageElement:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
|
| nextSibling
|
| previousSibling
BeautifulSoup-find_all用法
积累程序-爬虫-BeautifulSoup-find_all用法
import requests
from bs4 import BeautifulSoup as bs
webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串
soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
##
##items=soup.find_all(class_="h")
##for i in items:
## print(i)
items2=soup.find_all(class_="item")
for iTag in items2:
    for i in iTag.find_all():
        print(i)
requests与BeautifulSoup的区别
import requests
file1=requests.get("https://www.pku.edu.cn")
file1.encoding="utf-8"
data=file1.text
myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)
myFile.close()
'''
soup的数据类型是<class 'bs4.BeautifulSoup'>,说明soup是一个BeautifulSoup对象
打印的soup,是所请求网页的完整HTML源代码
虽然response.text和soup打印出来的内容表面上一模一样,但它们属于不同的类:<class 'str'> 与 <class 'bs4.BeautifulSoup'>。前者是字符串,后者是已经被解析过的BeautifulSoup对象。打印结果相同,是因为直接打印BeautifulSoup对象时会调用它的__str__方法,返回的正是文档的字符串形式。
'''
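可以用如下几行验证两者的类型差异(接着上面的代码,soup需要先从data解析出来):
from bs4 import BeautifulSoup
soup=BeautifulSoup(data,"html.parser")
print(type(data))#<class 'str'>
print(type(soup))#<class 'bs4.BeautifulSoup'>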
获取网页中文字并有序展现
爬虫-bs-获取北大网页中的网站和文字-并有序展现
import requests
from bs4 import BeautifulSoup as bs
webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串
soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
items2=soup.find_all(class_="item")
##for iTag in items2:
## for i in iTag.find_all():
## myText=i.get_text()
## print(myText)
##
for everyTag in items2:
    print(everyTag)
    print()
    print("文字部分")
    myText=everyTag.get_text()
    print(myText)
    print()
    print("链接部分")
    myLinks=everyTag.find_all("a")#everyLink是BS中的tag
    for everyLink in myLinks:
        if "href" in everyLink.attrs:#attrs只有在BS的tag中才可以用。
            print(everyLink)
input()
global
变量作用域
一个在函数内部赋值的变量仅能在该函数内部使用(局部作用域),它们被称作局部变量
在所有函数之外赋值的变量,可以在程序的任何位置使用(全局作用域),它们被称作全局变量
如果想将局部变量声明为全局变量,就要用到global语句
tfc = 1000
def tvc():
    global tvc # global语句一般写在函数体的第一行,它会告诉Python,“我希望tvc是个全局变量,所以请不要用这个名字创建一个局部变量”
    vc = 200
    x = 10
    tvc = vc * x
def tc():
    print(tfc+tvc) # tc()函数内部现在可以直接使用声明后的全局变量tvc
tvc()
tc()
# 》》3000
match
import re
m=re.match("hello","hellov world")
if m is not None:
    print(m.group())
    print(m.__class__.__name__)
m=re.match("bird","bird is flying")
print(m.group())
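需要注意,re.match只从字符串开头开始匹配,如果模式出现在字符串中间,match会返回None,此时应改用re.search。下面是一个简单对比:
import re
print(re.match("world","hello world"))#None,因为字符串开头不是world
print(re.search("world","hello world").group())#world,search会在整个字符串中查找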
使用soup.prettify() 有序呈现
import requests
import csv
from bs4 import BeautifulSoup as bs
url="https://www.zhihu.com/follow"
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
webFile= requests.get(url, params=params, headers=headers)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
print(soup.prettify())
爬取标签
从网页中爬取标签
从超星、维基、知网、阿帕比网站,Ctrl + S 保存网页后,爬取其中的文本目录信息。可以用如下代码实现操作。
myWord="""
[Images]
[Font]
Language=GBK
FontSize=7
Margin=0.5
[Bkmk]
File=FreePic2Pdf_bkmk.txt
AddAsText=0
ShowBkmk=1
ShowAll=1
BasePage=1
[Main]
ContentsPage=
TextPage=
"""
Head='''
首
\t书名页
\t版权页
\t序言
目录
'''
def test():
htmlName=str(input("请输入网页Wiki CNKI ChoaXing Apabi文件名称:"))
import requests
from bs4 import BeautifulSoup as bs
webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()
mysoup=bs(data,"html.parser")
mysoup.prettify()
writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
print(Head,file=writeFile)
if "维基文库" in htmlName:
print("Wiki")
result=mysoup.find_all("li")
choice=input("请选择通行A 或 调试T:")
for i in result:
myInfo=i.get_text()
if choice=="A":
if "卷" in myInfo:
mylist=myInfo.split(" ")
print(mylist[0],file=writeFile)
for m in mylist[1:]:
print("\t",m,file=writeFile)
elif choice=="T":
if "卷" in myInfo:
print(myInfo,file=writeFile)
else:
print("\t",myInfo,file=writeFile)
elif "阿帕比" in htmlName:
print("Apabi")
result=mysoup.find_all("li")
for i in result:
myInfo=i.get_text()
for word in "()1234567890页":
myInfo=myInfo.replace(word,"")
infoList=myInfo.split(" ")
if len(infoList)>2:#将单个的对象排除。统一切分处理
print(infoList[1],file=writeFile)
for m in infoList[2:]:
print("\t",m,file=writeFile)
elif len(infoList)==2:
print("\t",myInfo,file=writeFile)
elif "中国知网" in htmlName or "CNKI" in htmlName:
print("CNKI")
result=mysoup.find_all(attrs={"class":"catalog-listDiv"})
if len(result)==0:
result=mysoup.find_all("li")
for i in result:
myInfo=i.get_text()
infoline=myInfo.split(" ")
for line in infoline:
if "摘要" in line:
nline=line.split(" ")
for m in nline:
print(m,file=writeFile)
elif "第" in line and "章" in line and "节" not in line:
wline=line.split(" ")
print("\t",wline[0],file=writeFile)
for m in wline[1:]:
print(m,end="",file=writeFile)
print("\n",file=writeFile)
elif "结语 参考文献 致谢" in line:
nline=line.split(" ")
print(nline[0]+nline[1],file=writeFile)
for m in nline[2:]:
print(m,file=writeFile)
else:print("\t",line,file=writeFile)
else:
print("ChaoXing")
result=mysoup.find_all("span")
for i in result:
if "node_name" in str(i):
sen=i.get_text()
sen=sen.lstrip(" ")
if "第" in str(i) and "章" in str(i):
print(sen,file=writeFile)
elif "第" in str(i) and "讲" in str(i):
print(sen,file=writeFile)
elif "卷" in str(i) or "论" in str(i) or "编" in str(i):
for hz in "一二三四五六七八九十":
if hz in str(i):
print(sen,file=writeFile)
break
else:print("\t",sen,file=writeFile)
else:
print("\t",sen,file=writeFile)
print("尾",file=writeFile)
writeFile.close()
itfFile=open("FreePic2Pdf.itf","w",encoding="utf-8")
print(myWord,file=itfFile)
itfFile.close()
即可完成。
从文本中获取标签
可以使用如下代码:
def test():
import re
pattern="“.*?[。?:;”]"
fileName=input("选择句子开头作为标签,请输入文本名称:")#说文解字,尔雅
part=input("请输入1或2个区分层级关键词{第部章卷...}:")
if len(part)==1:
a=part
b=part
elif len(part)==2:
a=part[0]
b=part[1]
choice="L"
choice=input("文本对话选L;Wiki目录选W;开头首字母选S;开头前面句子选E:")
choice=choice.upper()
file=open(fileName,"r",encoding="utf-8")
data=file.read()
file.close()
data=data.replace("编辑","")
datalines=data.splitlines()
def ShuoWen():
#说文
for line in datalines:
for word in line:
if word in "(( )0123456789:↑":
break
print("\t",word,file=wfile)
def ErYa():
for line in datalines:
if part in line:
print(line,file=wfile)
else:print("\t",line[:5],file=wfile)
def Wiki():
for line in datalines:
if part in line and len(line)<=4 and len(line)>=2:
print(line,file=wfile)
elif "↑" in line or "◄" in line or "►" in line or " 註釋" in line:pass
elif len(line)>=2 and len(line)<=10:
print("\t",line,file=wfile)
def LunYu():
zhang=0
jieming=0
for line in datalines:
if a in line and b in line:
print(line,file=wfile)
zhang+=1
jieming=1
if a not in line and b not in line and len(line)>4:#【经验】if ...if...和if ... else...不同。前者是单线,后者是双线。
result=re.compile(pattern).findall(line)
print("\t",f"{zhang}.{jieming}",end="",file=wfile)
if len(result)!=0:#选择引号内的句子。
jieming+=1
n=0
for i in result:
i=i.lstrip("“")
print(i,file=wfile)
n+=1
if n==1:
break
else:#没有引号则选择开头句子
jieming+=1
for w in line:
print(w,end="",file=wfile)
if w in ":。;":
break
print("\n",file=wfile)
wfile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
if choice=="S":
ShuoWen()
elif choice=="E":
ErYa()
elif choice=="W":
Wiki()
elif choice=="L":
LunYu()
wfile.close()
print("已经完成")
即可实现。
爬取文本
爬虫实践从wiki中下载文本
def test():
import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re
webUrl=input("请输入书籍所在的维基网址:")
infoList=webUrl.split("/")
articleName=infoList[-1]
startTime=time.time()
writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
obs=bs(data,"html.parser")
obs.prettify()
resultLink=obs.find_all("li")
webList=[]
for link in resultLink:
if articleName in str(link):
iname=link.get_text()
iweb=webUrl+"/"+iname
webList.append(iweb)
for iweb in webList:
print(iweb)
iFile=requests.get(iweb)
iFile.encoding="utf-8"
idata=iFile.text
iobs=bs(idata,"html.parser")
iobs.prettify()
result0=iobs.find_all(attrs={"class":"section-heading"})
## result1=iobs.find_all("section")
## print(result1)
result1=iobs.find_all(attrs={"class":"mw-parser-output"})
## for i in result1:
## print(i.get_text(),file=writeFile)
##
if len(result0)!=0:
result1.pop(0)#如果开头标题有多余信息,则使用这个软件
xy=zip(result0,result1)
for i in xy:
print(i[0].get_text()[:-2],file=writeFile)#下载《春秋左传正义》的时候用了这个程序
print(i[1].get_text(),file=writeFile)
else:
for i in result1:
print(i.get_text(),file=writeFile)#下载《史记三家注》用了这个程序
time.sleep(0.05+random.randint(0,2))
writeFile.close()
endTime=time.time()
long=(endTime-startTime)/60
print("总记时:","{0:4.2}".format(long),"分钟。")
这段代码还可以进一步完善,主要是补充注释,提醒各章节链接的拼接格式需要按目标网站的实际情况调整:
def test():
import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re
webUrl=input("请输入书籍所在的维基网址:")
infoList=webUrl.split("/")
articleName=infoList[-1]
startTime=time.time()
writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
obs=bs(data,"html.parser")
obs.prettify()
resultLink=obs.find_all("li")
webList=[]#需要依据实际情况调整章节的网络链接格式
for link in resultLink:
if articleName in str(link):
iname=link.get_text()
iweb=webUrl+"/"+iname
webList.append(iweb)#有的网站是“卷01”,不按照链接体现的格式。这个就得调整程序了。
for iweb in webList:
print(iweb)
iFile=requests.get(iweb)
iFile.encoding="utf-8"
idata=iFile.text
iobs=bs(idata,"html.parser")
iobs.prettify()
result0=iobs.find_all(attrs={"class":"section-heading"})
## result1=iobs.find_all("section")
## print(result1)
result1=iobs.find_all(attrs={"class":"mw-parser-output"})
## for i in result1:
## print(i.get_text(),file=writeFile)
##
if len(result0)!=0:
result1.pop(0)#如果开头标题有多余信息,则使用这个软件
xy=zip(result0,result1)
for i in xy:
print(i[0].get_text()[:-2],file=writeFile)#下载《春秋左传正义》的时候用了这个程序
print(i[1].get_text(),file=writeFile)
else:
for i in result1:
print(i.get_text(),file=writeFile)#下载《史记三家注》用了这个程序
time.sleep(0.05+random.randint(0,2))
writeFile.close()
endTime=time.time()
long=(endTime-startTime)/60
print("总记时:","{0:4.2}".format(long),"分钟。")
test()
爬虫实践从zdic中下载文本
import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re
def test():
    a=int(input("请输入汉典网页起始页码:"))
    b=int(input("请输入汉典网页终止页码:"))
    myName=input("请输入目标文件名:")
    startTime=time.time()
    HouZhui=".docx"
    resultName=myName+HouZhui
    urlList=[]
    for i in range(a,b+1):
        webUrl="https://gj.zdic.net/archive.php?aid-"+str(i)+".html"
        urlList.append(webUrl)
    zongShu=len(urlList)
    n=1
    writeFile=open(resultName,"w",encoding="utf-8")
    for webUrl in urlList:
        webfile=requests.get(webUrl)
        webfile.encoding="utf-8"
        data=webfile.text
        obs=bs(data,"html.parser")
        obs.prettify()
        title=obs.title
        for i in title:
            print("\n",file=writeFile)
            print(i,file=writeFile)
        print("★",file=writeFile)
        result=obs.find_all(attrs={"id":"snr2"})
        art=str(result)
        artlines=art.splitlines()
        article=artlines[0][17:]
        article=article.replace("<br/>","s")
        for i in article:
            if i=="s":
                print("\n",file=writeFile)
                print("\t",file=writeFile)
            else:print(i,end="",sep="",file=writeFile)
        print("……",file=writeFile)
        print("\n",file=writeFile)
        time.sleep(0.05+random.randint(0,2))
        percent=float(n/zongShu)
        print(f"第{n}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))
        n+=1
    writeFile.close()
    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")
爬虫实践 从ctext 中下载文本
从ctext(中国哲学书电子化计划,ctext.org)中下载文本,可以用到官方提供的ctext包。
https://pypi.org/project/ctext/
下面以《论语》为例,说明如何下载。
代码如下:
from ctext import *
setapikey("your-api-key-goes-here")
setlanguage("zh")
stats = getstats()
status = getstatus()
titles = gettexttitles()
capabilities = getcapabilities()
urn = readlink("https://ctext.org/analects")#以论语为例
passages = gettext("ctp:analects/xue-er")
print(passages)
又有如下程序,亦可以实现功能。
def test():
'''
https://ctext.org/wiki.pl?if=gb&chapter=868712
https://ctext.org/wiki.pl?if=gb&chapter=969206
webUrl="https://ctext.org/wiki.pl?if=gb&res=970278
'''
import requests
from bs4 import BeautifulSoup as bs
import time
import random
headers={}#建立字典
user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]
startTime=time.time()
webUrl=input("请输入书本在Ctext中目录所在网址:")#目录页所在编码,可以获得每章的链接
##webUrl="https://ctext.org/wiki.pl?if=gb&res=642006"
startPage=int(input("请输入目录列表中所求链接的序列数字:"))
webfile=requests.get(webUrl)
webfile.encoding="utf-8"
data=webfile.text
obs=bs(data,"html.parser")
obs.prettify()
result=obs.find_all("a")
Name=obs.h2
nameStr=Name.get_text()
nameList=nameStr.split("[")
resultName=nameList[0]
urlList=[]
for i in result:
if "wiki.pl?" and "卷" in str(i):
url=list(i.attrs.values())
webLink="https://ctext.org/"+url[0]
urlList.append(webLink)
elif "wiki.pl?" and "序" in str(i):
url=list(i.attrs.values())
webLink="https://ctext.org/"+url[0]
urlList.append(webLink)
numList=[str(i) for i in range(0,10)]
zongShu=len(urlList)
n=0
writeFile=open(f"{resultName}_FromCtext.txt","a+",encoding="utf-8")
start=startPage-1
for webUrl in urlList[start:]:#列表从0开始
headers['User-Agent']= random.choice(user_agent_list)
print(webUrl)
webfile=requests.get(webUrl,headers=headers)
webfile.encoding="utf-8"
data=webfile.text
obs=bs(data,"html.parser")
obs.prettify()
title=obs.title
for i in title:
print(i,file=writeFile)
print("★",file=writeFile)
result=obs.find_all(class_="ctext")
for i in result:
myStr=i.get_text()
for num in numList:
myStr=myStr.replace(num,"")
print(myStr,file=writeFile)
n+=1
time.sleep(3+random.randint(0,3))
percent=float((n+start)/zongShu)
print(f"第{n+start}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))
endTime=time.time()
long=(endTime-startTime)/60
print("总记时:","{0:4.2}".format(long),"分钟。")
writeFile.close()
三者结合
想把汉典(Zdic)、维基(Wiki)、哲学电子书(Ctext)三个下载脚本结合起来统一调用,可以用如下代码(假定三个脚本分别保存为BsFromZdic.py、BsFromWik.py、BsFromCtext.py,与本脚本位于同一目录):
'''
https://gj.zdic.net/archive.php?aid-6679.html
'''
def test():
    webChoice=input("汉典:Z;维基:W;哲学电子书:C。请输入选择:")
    webChoice=webChoice.upper()
    if webChoice=="Z":
        import BsFromZdic
        BsFromZdic.test()
    elif webChoice=="W":
        import BsFromWik
        BsFromWik.test()
    elif webChoice=="C":
        import BsFromCtext
        BsFromCtext.test()
爬取图片
贴吧中的动画图片
《虹猫蓝兔七侠传》是一部非常不错的动画片,后续还有漫画版的前传和后传。百度贴吧中,有这样一系列图片,现在想把图片爬下来,合成PDF便于阅读。写如下代码:
'''
<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=fbff
fefc1f950a7b75354ecc3ad3625c/4c5fa44bd11373f035f5ca55a60f4bfbf9ed04ca.jpg" pic_ext="jpeg" pic_type="0"
width="560" height="388">
<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=efda23249e82d158bb8259b9b00
819d5/acb1c2ef76094b36927cbe27a1cc7cd98c109d2e.jpg"
pic_ext="jpeg" pic_type="0" width="560" height="426">
<img class="image_original_original"
style="z-index: 2; width: 585.176px; height: 450px; top: 0px; left: 75.9122px;"
src="http://imgsrc.baidu.com/forum/pic/item/f82405d7912397dd928f3cce5b82b2b7d1a28726.jpg">
'''
urlList=["http://tieba.baidu.com/p/3175345087",
"http://tieba.baidu.com/p/3175362317",
"http://tieba.baidu.com/p/3175373350",
"http://tieba.baidu.com/p/3175383386",
"http://tieba.baidu.com/p/3175393635",
"http://tieba.baidu.com/p/3175402697",]
import urllib.request
import re
zhang=1
for webUrl in urlList:
    i=1
    htmll=urllib.request.urlopen(webUrl).read()
    data=str(htmll)
    pattern='''img class="image_original_original" src=.(.+?\.jpg)"'''
    result=re.compile(pattern).findall(data)
    for imageUrl in result:
        print(imageUrl)
##        imageName=str(zhang)+"-"+str(i)+".jpg"
##        i=i+1
##        urllib.request.urlretrieve(imageUrl,filename=imageName)
##    zhang=zhang+1
    print()
注意:页面里同一张图往往有缩略(模糊)图和高清图两种,文件名并不一样;高清图的地址不直接出现在当前页面中,需要按规律修正链接后才能爬取。
那么,能否从中找到共性,写成通用的代码呢?于是做了如下尝试:
def test():
'''
http://imgsrc.baidu.com/forum/pic/item/e69597510fb30f24ebcb4ec9ca95d143ac4b0347.jpg
http://imgsrc.baidu.com/forum/pic/item/4c0f7af082025aaf165fdc01f9edab64024f1aa3.jpg
'''
import urllib.request
import requests
from bs4 import BeautifulSoup as bs
import re
print("每个网站的情况并不一致,借鉴此程序后,重新写代码为宜。")
mychoice=input("是否继续 Y or N:")
if mychoice=="Y":
pass
else:
exit()
print("如为避免遗漏而需下载网页,请复制网页代码到web.html并输入D。")
print("如在网上运行,请输入W。")
choice=input("Download or Web:")
webUrlList=[]
while True:
webUrl=input("请输入要下载图片所在的完整网站:")
webUrlList.append(webUrl)
webChoice=input("是否继续输入网站,Y or N:")
if webChoice=="N":
break
##webUrl="https://baike.baidu.com/pic/黑小虎传奇/4659511"#点击进入百度黑小虎传奇图册。
adjust=input("是否需要调整高清图,Y or N:")
classImage=str(input("请输入obs寻找到的class类别:"))
pattern='src="..*?"'
zhang=1
if choice=="D" and adjust=="N":
myfile=open("web.html","r",encoding="utf-8")
data=myfile.read()
myfile.close()
obs=bs(data,"html.parser")
result=obs.find_all(attrs={"class":classImage})
n=1
for i in result:
myLink=re.findall(pattern,str(i))
bLink=str(myLink[0])
print(bLink)
imageName="图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
zhang+=1
elif choice=="D" and adjust=="Y":
addLink="watermark,image_d2F0ZXIvYmFpa2UxODA=,g_7,xp_5,yp_5/format,f_auto"
myfile=open("web.html","r",encoding="utf-8")
data=myfile.read()
myfile.close()
obs=bs(data,"html.parser")
result=obs.find_all("img")
n=1
for i in result:
try:
## print(i)
myLink=re.findall(pattern,str(i))
aLink=myLink[0]
aList=aLink.split("/")
aLink=aList[2][:-1]#需要依据实际情况不断调整。
## print(aList)
bLink=f"https://bkimg.cdn.bcebos.com/pic/{aLink}?x-bce-process=image/{addLink}"
#### bLink=aList[-1]#通过观察,找到更为清晰的图片链接。
print(bLink)
imageName="图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
except:pass
zhang+=1
elif choice=="W" and adjust=="Y":
addLink=input("请根据情况输入图片网址的前半部分:")
#"http://imgsrc.baidu.com/forum/pic/item/"
for webUrl in webUrlList:
html=requests.get(webUrl)
html.encoding="utf-8"
data=html.text
obs=bs(data,"html.parser")
obs.prettify()
result=obs.find_all(attrs={"class":classImage})
n=1
for i in result:
print(i)
myLink=re.findall(pattern,str(i))#bs是用find_all,而re使用findall
print(myLink)
aLink=myLink[0]
aList=aLink.split("/")
bLink=addLink+aList[-1]#通过观察,找到更为清晰的图片链接。
print(bLink)
imageName=str(zhang)+"图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
zhang+=1
elif choice=="W" and adjust=="N":
zhang=1
for webUrl in webUrlList:
html=requests.get(webUrl)
html.encoding="utf-8"
data=html.text
obs=bs(data,"html.parser")
obs.prettify()
result=obs.find_all(attrs={"class":classImage})
n=1
for i in result:
myLink=re.findall(pattern,str(i))
bLink=str(myLink[0])
print(bLink)
imageName="图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
zhang+=1
else:print("未能导出图片,请进一步完善程序。")
网站中的地理图片
国家地理网站中,有一些图片也可以进行爬取。
# inputfile='nationalgeograph.html'
# outputfile="nationalgeograph-urls.txt"
def inputFile():
    f=open("nationalgeographic.htm","r",encoding="utf-8")
    ls=f.readlines()
    f.close()
    print(ls)
    #<img alt="火山口" src="http://image.ngchina.com.cn/2019/1104/20191104100458321.jpg">
    urls = []
    for line in ls:
        if 'img' in line:
            url = line.split('src=')[-1].split('"')[1]  # 取src=后第一对双引号内的内容,即图片地址
            if 'http' in url:
                urls.append(url)
    #print(urls)
    return urls
#inputFile()
def showResults():
    urls=inputFile()
    f=open("result.txt","w",encoding="utf-8")
    count = 0
    for url in urls:
        print("第{}个URL:{}".format(count, url),file=f)
        print("第{}个URL:{}".format(count, url))
        count += 1
    f.close()
showResults()
注意,爬取时要先从网页源码中提取出图片本身的http链接(img标签的src属性),再去下载。对其他保存到本地的网页文件,同样的思路也适用:
# inputfile='nationalgeograph.html'
# outputfile="nationalgeograph-urls.txt"
def inputFile():
    f=open("file1.htm","r",encoding="utf-8")
    ls=f.readlines()
    f.close()
    print(ls)
    urls = []
    for line in ls:
        if 'img' in line:
            url = line.split('src=')[-1].split('"')[1]  # 取src=后第一对双引号内的内容
            if 'http' in url:
                urls.append(url)
    #print(urls)
    return urls
#inputFile()
def showResults():
    urls=inputFile()
    f=open("result.txt","w",encoding="utf-8")
    count = 0
    for url in urls:
        print("第{}个URL:{}".format(count, url),file=f)
        print("第{}个URL:{}".format(count, url))
        count += 1
    f.close()
showResults()
网站中的人物图片
在学习爬虫的时候,借鉴了网友的代码,如下所示。运行过程中发现有bug需要修正;在一步步运行并修正代码的过程中,也对爬虫有了更深入的理解。
import requests
url="http://www.runoob.com"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
#设置headers,网站会根据这个判断你的浏览器及操作系统,很多网站没有此信息将拒绝你访问
#用get方法打开url并发送headers
html = requests.get(url,headers = header)
#print(html.text)
#提取所需要的信息
##将获取的源码转换为BeautifulSoup对象
##使用find搜索需要的数据,保存到容器中
from bs4 import BeautifulSoup
url='http://www.mzitu.com'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
html=requests.get(url,headers=header)
#print(html.text)
#使用自带的html.parser解析,速度慢但通用
soup = BeautifulSoup(html.text,'html.parser')
#寻找div中的所有a
all_a=soup.find("div",class_="postlist").find_all("a",target="_blank")
##for a in all_a:
## title=a.get_text()#提取文本
## print(title)
##
##
##all_div=soup.find("div",class_="postlist")
##for i in all_div:
## tmp=i.get_text()
## print(tmp)
#find 返回类型和find_all返回类型不同,find_all才能用get_text()
##page = soup.find_all('a', class_='page-numbers')
##max_page = page[-2].text
###print(max_page)
picture=soup.find("div",class_='postlist').find_all("a",target="_blank")
for everylink in picture:
    #print(everylink)
    tmp=everylink.attrs
    #print(tmp)
    mytxt=everylink.get_text()
    if "href" in everylink.attrs:
        print(f"href={everylink.attrs['href']}",sep="\t")
#print(picture)
# same_url = 'http://www.mzitu.com/page/' # 主页默认最新图片
# 获取每一类MM的网址
##same_url = 'https://www.mzitu.com/mm/page/'
##
##
##for n in range(1, int(max_page) + 1):
## ul = same_url + str(n)
## #print(ul)
## # 分别对当前类每一页第一层url发起请求
## start_html = requests.get(ul, headers=header)
## # 提取所有MM的标题
## soup = BeautifulSoup(start_html.text, "html.parser")
## all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
## #print(all_a)
##
## # 遍历所有MM的标题
## for a in all_a:
## # 提取标题文本,作为文件夹名称
## title = a.get_text()
## print(title)
## if(title != ''):
## print("准备扒取:" + title)
## if(oa.path.exists(path+title.strip()))
##
##
##
于是,经过试错,不断修正,完善为如下代码:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
all_url = 'https://www.mzitu.com'
# http请求头
Hostreferer = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Referer': 'http://www.mzitu.com'
}
# 此请求头Referer破解盗图链接
Picreferer = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Referer': 'http://i.meizitu.net'
}
# 对mzitu主页all_url发起请求,将返回的HTML数据保存,便于解析
start_html = requests.get(all_url, headers=Hostreferer)
# Linux保存地址
# path = '/home/Nick/Desktop/mzitu/'
# Windows保存地址
path = 'E:/mzitu/'
# 获取最大页数
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text
# same_url = 'http://www.mzitu.com/page/' # 主页默认最新图片
# 获取每一类MM的网址
same_url = 'https://www.mzitu.com/mm/page/' # 也可以指定《qingchun MM系列》
for n in range(1, int(max_page) + 1):
# 拼接当前类MM的所有url
ul = same_url + str(n)
# 分别对当前类每一页第一层url发起请求
start_html = requests.get(ul, headers=Hostreferer)
# 提取所有MM的标题
soup = BeautifulSoup(start_html.text, "html.parser")
all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
# 遍历所有MM的标题
for a in all_a:
# 提取标题文本,作为文件夹名称
title = a.get_text()
if(title != ''):
print("准备扒取:" + title)
# windows不能创建带?的目录,添加判断逻辑
if(os.path.exists(path + title.strip().replace('?', ''))):
# print('目录已存在')
flag = 1
else:
os.makedirs(path + title.strip().replace('?', ''))
flag = 0
# 切换到上一步创建的目录
os.chdir(path + title.strip().replace('?', ''))
# 提取第一层每一个MM的url,并发起请求
href = a['href']
html = requests.get(href, headers=Hostreferer)
mess = BeautifulSoup(html.text, "html.parser")
# 获取第二层最大页数
pic_max = mess.find_all('span')
pic_max = pic_max[9].text
if(flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
print('已经保存完毕,跳过')
continue
# 遍历第二层每张图片的url
for num in range(1, int(pic_max) + 1):
# 拼接每张图片的url
pic = href + '/' + str(num)
# 发起请求
html = requests.get(pic, headers=Hostreferer)
mess = BeautifulSoup(html.text, "html.parser")
pic_url = mess.find('img', alt=title)
print(pic_url['src'])
html = requests.get(pic_url['src'], headers=Picreferer)
# 提取图片名字
file_name = pic_url['src'].split(r'/')[-1]
# 保存图片
f = open(file_name, 'wb')
f.write(html.content)
f.close()
print('完成')
print('第', n, '页完成')
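The script above strips only '?' from titles because Windows cannot create directories containing it; characters such as \ / : * " < > | are equally invalid. A small helper along these lines (a sketch, not part of the original script) keeps that directory logic in one place:
import os
import re

def safe_dirname(title):
    """Remove characters Windows does not allow in file or directory names."""
    return re.sub(r'[\\/:*?"<>|]', '', title).strip()

def ensure_dir(base, title):
    """Create base/<sanitised title> if needed, chdir into it, return whether it already existed."""
    target = os.path.join(base, safe_dirname(title))
    existed = os.path.exists(target)
    if not existed:
        os.makedirs(target)
    os.chdir(target)
    return existed

# Usage inside the loop above (sketch):
# flag = 1 if ensure_dir(path, title) else 0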
个人图书馆中的学习图片
在360doc中,有些图片很利于学习,如何爬取呢?
写如下代码:
import urllib.request
import requests
from bs4 import BeautifulSoup as bs
import re
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
webUrl="http://www.360doc.com/showweb/0/0/1104723360.aspx"
webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
print(data)
运行后,返回效果如下
<html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
<center><h1>403 Forbidden</h1></center>
<hr><center>nginx</center>
</body>
</html>
A likely cause is that the page requires a logged-in account (or at least browser-like request headers) before it can be viewed.
403 Forbidden is an HTTP status code: the server understood the request but refuses to fulfil it, i.e. access to the resource is denied, and simply resending the same request will not help.
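One thing the snippet above never does is pass the headers dict it defines to requests.get, so the request goes out without a browser User-Agent. Sending it sometimes clears a 403; this is only a sketch, and 360doc may still refuse the request without a logged-in session's cookies.
import requests

webUrl = "http://www.360doc.com/showweb/0/0/1104723360.aspx"
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}

webFile = requests.get(webUrl, headers=headers)   # this time actually send the headers
print(webFile.status_code)                        # 200 if accepted, 403 if still refused
if webFile.status_code == 200:
    webFile.encoding = "utf-8"
    print(webFile.text[:200])                     # peek at the start of the page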
If the request is still refused, simply pressing Ctrl + S and saving the whole page is often the more efficient route: all of the page's images are saved along with it.
【心得】
爬取综合信息
网站中的邮箱号码
对于网易中的邮箱号码,也可以进行爬取。
def Gupiao():
htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
htmlfile.encoding='utf-8'
mysoup = BeautifulSoup(htmlfile.text,'html.parser')
mycontent=mysoup.prettify()
#print(type(mycontent))
#输出字符串的前面信息,便于观察整个网站构成
print(mycontent[:200])
print()
#寻找需要的信息,区分不同的语法
def Find():
myinfor=mysoup.find_all("a")
for i in myinfor:
tmp=i.get_text()
print(tmp)
print(i)
print()
print(i.prettify())
#print(myinfor)
#将需要的网站输出
def Wangzhan():
urlsList=[]
myinfor=mysoup.find_all("a")
for line in myinfor:
#print(line)
tmp=str(line)#line的类型是<class 'bs4.element.Tag'>
if "http" in tmp:
url=tmp.split('"')#将长的字符串切分,留下网站
urlsList.append(url[1])
print(line.get_text())#获得网站的标题
print(url[1])#输出网站字符串
Wangzhan()
Gupiao()
def Ceyan():
htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
htmlfile.encoding='utf-8'
mysoup = BeautifulSoup(htmlfile.text,'html.parser')
mycontent=mysoup.prettify()
#print(type(mycontent))
print(mycontent[:500])
print()
myinfor=mysoup.find_all("a")#<a href="http://www.163.com/">网易首页</a>,寻找的是属性,如a,如td,如tr,head,body,
for i in myinfor:
tmp=i.get_text()
print(tmp)
print(i)
print()
print(i.prettify())#用不同的语法格式,看看输出的效果如何。然后就知道各个语句的用法何在。prettify的作用是把密密麻麻的一行输出为整齐的几行,便于阅读。
#print(myinfor)
Ceyan()
In practice problems still come up. Here the same link extraction is run against a locally saved copy of the page (stock1.html), so the filtering logic can be adjusted and re-run step by step:
from bs4 import BeautifulSoup
import time
htmlFile=open("stock1.html","r",encoding="utf-8")
htmlContent=htmlFile.read()
#time.sleep(10) #暂停10秒
myBS=BeautifulSoup(htmlContent,"html.parser")
#print(myBS)
myLinks=myBS.find_all("a")
#print(myLinks)
for everyLink in myLinks:
myText=everyLink.get_text()
#print(myText)
if "163.com" not in myText:
print("test")
print(myText)
if "href" in everyLink.attrs:#属性attrs
print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
#print(myText,":href=",everyLink.attrs['href'],sep="\t")
'''
myBS=BeautifulSoup(htmlContent,'html.parser')
myLinks=myBS.find_all('a')
for everyLink in myLinks:
    myText=everyLink.get_text()
    if '163.com' not in myText:
        if 'href' in everyLink.attrs:
            print(everyLink)
Question: why do 163.com links still show up?
Because the filter checks the link text returned by get_text() (for example "网易首页"), not the href attribute, so a link whose visible text never mentions 163.com is kept even though its href points to 163.com.
运行结果:
网易首页:href=http://www.163.com/
新闻:href=http://news.163.com/
体育:href=http://sports.163.com/
NBA:href=http://sports.163.com/nba/
娱乐:href=http://ent.163.com/
财经:href=http://money.163.com/
股票:href=http://money.163.com/stock/
汽车:href=http://auto.163.com/
科技:href=http://tech.163.com/
'''
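The answer above also suggests the fix: test the href attribute rather than the link text. A minimal sketch against the same saved stock1.html:
from bs4 import BeautifulSoup

htmlFile = open("stock1.html", "r", encoding="utf-8")
htmlContent = htmlFile.read()
htmlFile.close()

myBS = BeautifulSoup(htmlContent, "html.parser")
for everyLink in myBS.find_all("a"):
    if "href" in everyLink.attrs and "163.com" not in everyLink.attrs["href"]:
        # only links whose URL is outside 163.com get printed now
        print(everyLink.get_text(), ":href=", everyLink.attrs["href"], sep="\t")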
爬取之后,有序输出。
网站中的大学信息
读取网络文本
import requests
webFile=requests.get("http://www.pku.edu.cn")
webFile.encoding="utf-8"
webFile=webFile.text
print(webFile)
解析网页
import requests
response=requests.get('https://www.pku.edu.cn')
mycode=response.status_code
mycontent=response.content
分析所爬内容
with open(r"E:\pkuCode.txt","r",encoding="utf-8") as myFile:
data=myFile.readlines()
myList=list(data)
for i in myList:
print(i)
input()
解析对象
import requests
from bs4 import BeautifulSoup as bs
webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串
soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
soup.prettify()
items2=soup.find_all(class_="item")
myFile=open(r"E:\mySchoolLink.txt","w",encoding="utf-8")
for everyTag in items2:
#print(everyTag)
print(file=myFile)
print("文字部分",file=myFile)
myText=everyTag.get_text()
print(myText,file=myFile)
print(file=myFile)
print("链接部分",file=myFile)
myLinks=everyTag.find_all("a")#everyLink是BS 中的tag
for everyLink in myLinks:
if "href" in everyLink.attrs:#attrs只有在BS 中tag中才可以用。
print(everyLink.attrs,file=myFile)
myFile.close()
requests can also be used to save the page source to a local file for later analysis:
import requests
file1=requests.get("https://www.pku.edu.cn")
file1.encoding="utf-8"
data=file1.text
myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)
myFile.close()
'''
soup的数据类型是<class 'bs4.BeautifulSoup'>,说明soup是一个BeautifulSoup对象
打印的soup,是所请求网页的完整HTML源代码
虽然response.text和soup打印出的内容表面上看长得一模一样,却有着不同的内心,它们属于不同的类:<class 'str'> 与<class 'bs4.BeautifulSoup'>。前者是字符串,后者是已经被解析过的BeautifulSoup对象。之所以打印出来的是一样的文本,是因为BeautifulSoup对象在直接打印它的时候会调用该对象内的str方法,所以直接打印 bs 对象显示字符串是str的返回结果
'''
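The point in the note above can be checked directly: the response text and the parsed soup print the same characters but belong to different classes. A short sketch:
import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.pku.edu.cn")
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")

print(type(response.text))   # <class 'str'>
print(type(soup))            # <class 'bs4.BeautifulSoup'>
# printing soup calls its __str__ method, which is why the two look identical
print(response.text[:80])
print(str(soup)[:80])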
网站中的音乐
import requests
from bs4 import BeautifulSoup as bs
headers = {
'origin':'https://y.qq.com',
# 请求来源,本案例中其实是不需要加这个参数的,只是为了演示
'referer':'https://y.qq.com/n/yqq/song/004Z8Ihr0JIu5s.html',
# 请求来源,携带的信息比“origin”更丰富,本案例中其实是不需要加这个参数的,只是为了演示
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
# 标记了请求从什么设备,什么浏览器上发出
}
# 伪装请求头
url1='''
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
'''
url2="p=1"
url3="""
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""
#注意,这个网址是在网页检查过程中找到并复制的,针对文本所在内容的网址,而不是qq音乐的官网。详情参看风变编程笔记。
for i in range(1,10):
url=url1+"p="+str(i)+url3
webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text
jsonMusic=webFile.json()
listMusic = jsonMusic['data']['song']['list']
for i in listMusic:
print("专辑名:",i["albumname"])
print("歌曲名:",i["songname"])
print('播放时长:'+str(i['interval'])+'秒')
print('播放链接:https://y.qq.com/n/yqq/song/'+i['media_mid']+'.html\n\n')
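Splicing "p=" into a triple-quoted URL works, but the embedded newlines and the unused url2 make it fragile. requests can build the query string itself from a params dict, the same way the Zhihu example later in these notes does. A sketch of the equivalent request; the parameter names are simply the ones already visible in url1/url3 above, and whether the endpoint still answers (or now needs extra sign parameters) is up to QQ Music.
import requests

search_url = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp"
base_params = {
    "ct": "24", "qqmusic_ver": "1298", "new_json": "1",
    "remoteplace": "txt.yqq.song", "searchid": "57068364391640558",
    "t": "0", "aggr": "1", "cr": "1", "catZhida": "1", "lossless": "0",
    "flag_qc": "0", "n": "10",
    "w": "周杰伦",                 # the keyword that appears URL-encoded in url3 above
    "g_tk_new_20200303": "5381", "g_tk": "5381", "loginUin": "0", "hostUin": "0",
    "format": "json", "inCharset": "utf8", "outCharset": "utf-8",
    "notice": "0", "platform": "yqq.json", "needNewCode": "0",
}
for page in range(1, 10):
    params = dict(base_params, p=str(page))      # p is the page number
    webFile = requests.get(search_url, params=params)
    jsonMusic = webFile.json()
    for song in jsonMusic["data"]["song"]["list"]:
        print("专辑名:", song["albumname"])
        print("歌曲名:", song["songname"])
        print("播放时长:" + str(song["interval"]) + "秒")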
import requests
from bs4 import BeautifulSoup as bs
myHref="https://y.qq.com/n/yqq/singer/0025NhlN2yWrP4.html"
webFile=requests.get(myHref)
data=webFile.text
soup=bs(data,"html.parser")
print("""class_=js_song""")
items1=soup.find_all(class_="js_song")
count=0
for everyLink in items1:
myText=everyLink.get_text()
print("everyLink : ","\n",everyLink)
print("myText:","\n",myText)
print("everyLink.attrs:","\n",everyLink.attrs)
print(everyLink.attrs["href"])
count+=1
if count==1:
break
print()
print("""class_=songlist__songname_txt""")
items2=soup.find_all(class_="songlist__songname_txt")
count=0
for everyLink in items2:
myText=everyLink.get_text()
print("everyLink : ","\n",everyLink)
print("myText:","\n",myText)
print("everyLink.attrs:","\n",everyLink.attrs)
print(everyLink.attrs["class"])
count+=1
if count==1:
break
'''
if "href" in everyLink.attrs:#属性attrs
print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
print(myText,":href=",everyLink.attrs['href'],sep="\t")
Note: when extracting with bs4, the class used for filtering is the key. Every matched element is a Tag, and its attrs is a dict of that tag's attributes.
If the matched tags are links (<a> elements), everyLink.attrs["href"] gives the link.
If they are plain text containers matched only by class, there is no "href" key, and only entries such as everyLink.attrs["class"] are available.
'''
import requests
from bs4 import BeautifulSoup as bs
webURL="""
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
p=1
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""
webFile=requests.get(webURL)
webFile.encoding="utf-8"
data=webFile.text
jsonFile=webFile.json()
##print(type(jsonFile))#<class 'dict'>使用json()方法,将对象转为列表/字典
##for (k,v) in jsonFile.items():
## print(k)
musicData=jsonFile["data"]#JSON keys are strings, so the quotes are required; jsonFile[data] would look up whatever the variable data holds (here the page text) and raise a KeyError
##print(type(musicData))
##for (k,v) in musicData.items():
## print(k)
listMusic=musicData["song"]["list"]
print(type(listMusic))
for music in listMusic:
print("播放专辑:",music["album"]["name"])
print('播放时长:'+str(music['interval'])+'秒') # 查找播放时长
print('播放链接:https://y.qq.com/n/yqq/song/' +music['mid']+'.html\n\n')
input()
##
##soup=bs(data,"html.parser")
##print(type(soup))#<class 'bs4.BeautifulSoup'>
import requests
from bs4 import BeautifulSoup as bs
import openpyxl
workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"
url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"
webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
sheet1.append(["footer_platform_list__item"])
Tag1=soup.find_all(class_="footer_platform_list__item")
for everyTag in Tag1:
myText=everyTag.get_text()
myLinks=everyTag.find_all("a")
for i in myLinks:
if "href" in i.attrs:
myList1=[myText,i['href']]
print(myList1)
sheet1.append(myList1)
sheet1.append(["footer_link"])
Tag2=soup.find_all(class_="footer_link")
for everyTag in Tag2:
myText=everyTag.get_text()
myLinks=everyTag.find_all("a")
for i in myLinks:
if "href" in i.attrs:
myList2=[myText,i["href"]]
print(myList2)
sheet1.append(myList2)
workBook.save("积累文档-QQ音乐网络链接.xlsx")
import requests
from bs4 import BeautifulSoup as bs
import openpyxl
workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"
url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"
webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
myClass=['footer_platform_list__item','footer_link','footer_download','footer_copyright','footer_platform']#the class names to extract, each listed once
for everyClass in myClass:
print(everyClass)
sheet1.append([everyClass])
Tag1=soup.find_all(class_=everyClass)
for everyTag in Tag1:
myText=everyTag.get_text()
myLinks=everyTag.find_all("a")
for i in myLinks:
if "href" in i.attrs:
myList1=[myText,i["href"]]
print(myList1)
sheet1.append(myList1)
workBook.save("积累文档-QQ音乐链接简练版.xlsx")
网站中的题目
#爬取网站上的题目
from bs4 import BeautifulSoup
import time
import requests
def Pachong():
for pageNum in range(1,17):
htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey='+str(pageNum))
htmlFile.encoding='utf-8'
soup = BeautifulSoup(htmlFile.text,'html.parser')
print(soup)
input()
#Pachong()
htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey=2')
htmlFile.encoding='utf-8'
print(htmlFile)
def PaTi():
htmlfile=requests.get("http://vers.cqvip.com/view/course/chapter/detail.aspx?cid=125&chapter=%E6%98%8E%E4%BB%A3%E6%96%87%E5%AD%A6")
htmlfile.encoding='utf-8'
mysoup=BeautifulSoup(htmlfile.text,'html.parser')
mycontent1=mysoup.prettify()
print(mycontent1[:100])
mycontent2=mysoup.smooth()#smooth() merges adjacent NavigableStrings in place and returns None
print(mycontent2)#so this prints None
print("OK")
mycontent3=mysoup.select_one("div")
print(mycontent3)
print("Next")
print()
myinfor=mysoup.find("div").find_all("strong")
print(myinfor)
tmp=mysoup.find_next_sibling("div")#the soup object itself has no siblings, so this returns None
print(tmp)
#class="q-box"
PaTi()
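The comment above points at class="q-box". Assuming that class really wraps the question blocks on the page (not verified here), extracting them is the usual find_all-by-class pattern:
import requests
from bs4 import BeautifulSoup

def PaTi_qbox():
    url = ("http://vers.cqvip.com/view/course/chapter/detail.aspx"
           "?cid=125&chapter=%E6%98%8E%E4%BB%A3%E6%96%87%E5%AD%A6")
    htmlfile = requests.get(url)
    htmlfile.encoding = "utf-8"
    mysoup = BeautifulSoup(htmlfile.text, "html.parser")
    # class_ filters on the CSS class; "q-box" is taken from the comment above
    for box in mysoup.find_all(class_="q-box"):
        print(box.get_text(strip=True))
        print()

# PaTi_qbox()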
网站中的博客内容
知乎中的文章
import requests
import csv
#引用csv。
csv_file=open('articles.csv','w',newline='',encoding='utf-8')
#调用open()函数打开csv文件,传入参数:文件名“articles.csv”、写入模式“w”、newline=''。
writer = csv.writer(csv_file)
# 用csv.writer()函数创建一个writer对象。
list2=['标题','链接','摘要']
#创建一个列表
writer.writerow(list2)
#调用writer对象的writerow()方法,可以在csv文件里写入一行文字 “标题”和“链接”和"摘要"。
headers={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
url='https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?'
offset=0
#设置offset的起始值为0
while True:
params={
'include':'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'offset':str(offset),
'limit':'20',
'sort_by':'voteups',
}
#封装参数
res=requests.get(url,headers=headers,params=params)
#发送请求,并把响应内容赋值到变量res里面
articles=res.json()
print(articles)
data=articles['data']
#定位数据
for i in data:
list1=[i['title'],i['url'],i['excerpt']]
#把目标数据封装成一个列表
writer.writerow(list1)
#调用writerow()方法,把列表list1的内容写入
offset=offset+20
#在while循环内部,offset的值每次增加20
if offset > 40:
break
csv_file.close()
#写入完成后,关闭文件就大功告成
print('okay')
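The offset loop above stops after a fixed number of pages (offset > 40). A slightly more general sketch wraps the pagination in a function and stops either at a page limit or when the API returns no more data; the field names ('data', 'title', 'url', 'excerpt') are the ones already used above.
import csv
import requests

def fetch_articles(url, headers, params_base, max_pages=3):
    """Yield (title, url, excerpt) tuples page by page until empty or max_pages."""
    offset = 0
    for _ in range(max_pages):
        params = dict(params_base, offset=str(offset))   # overrides any existing offset
        res = requests.get(url, headers=headers, params=params)
        data = res.json().get('data', [])
        if not data:                 # no more articles: stop early
            break
        for item in data:
            yield item['title'], item['url'], item['excerpt']
        offset += 20

# Usage sketch, reusing the url/headers/params defined above:
# with open('articles.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(['标题', '链接', '摘要'])
#     for row in fetch_articles(url, headers, params, max_pages=3):
#         writer.writerow(row)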
import requests
from bs4 import BeautifulSoup as bs
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
url2="https://www.zhihu.com/org/jing-ji-ri-bao-xin-wen-ke-hu-duan/posts"
webFile=requests.get(url2,params=params,headers=headers)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
preData=soup.prettify()
items2=soup.find_all(class_="item")
for iTag in items2:
for i in iTag.find_all():
print(i)
爬取博客
from urllib3 import *
from re import *
http=PoolManager()
#禁止显示警告信息
disable_warnings()
#下载url对应web页面
url="https://www.cnblogs.com/"
result=http.request("GET",url)
htmlStr=result.data.decode("utf-8")
print(htmlStr)
#分析html代码
#通过正则表达式,获取所有关于目标的信息
#<a class="post-item-title" href="https://www.cnblogs.com/hzoi-fengwu/p/14922218.html" target="_blank">STL----vector注意事项</a>
aList=findall('<a[^>]*post-item-title[^>]*>[^<]*</a>',htmlStr)
result=[]
#pull the url that follows href= out of every matched <a ...> tag
for a in aList:
    #the capture group keeps the text between the double quotes after href=
    g=search(r'href[\s]*=[\s]*"([^"]+)"',a)
    if g!=None:
        url=g.group(1)
        result.append(url)
        #got the url
        print(url)
爬取博客标题-爬虫-正则表达式部分
网站中的词典
#网络爬虫进阶urllib.request
def ilovefish():
import urllib.request
myResponse=urllib.request.urlopen("https://ilovefishc.com/")#打开网页,获取信息
myHtml=myResponse.read()#读出数据
#print(myHtml)
myHtml=myHtml.decode("utf-8")#将二进制解码,按照网页信息<head> <meta charset="UTF-8">选择解码格式utf-8
#print(myHtml)
def placekitten():
#placekitten.com
import urllib.request
myResponse=urllib.request.urlopen("http://placekitten.com/500/600")#打开网页,获取信息
my_cat_img=myResponse.read()#读出数据
with open('cat_500_600.jpg','wb') as f:
f.write(my_cat_img)
def myrequest():
#urllib.request():This function always returns an object which can work as a context manager and has methods such as
#geturl() — return the URL of the resource retrieved, commonly used to determine if a redirect was followed
#info() — return the meta-information of the page, such as headers, in the form of an email.message_from_string() instance (see Quick Reference to HTTP Headers)
#getcode() – return the HTTP status code of the response.
import urllib.request
myresponse=urllib.request.urlopen("http://placekitten.com/300/500")
myurl=myresponse.geturl()
print(myurl)
print(myresponse.info())
print(myresponse.getcode())
def Cidan():
    #小甲鱼's example of turning the Youdao dictionary feature into a small program
    import urllib.request
    import urllib.parse
    url='http://fanyi.youdao.com/'
    data={}#the form fields observed in the browser's network panel go here
    postData=urllib.parse.urlencode(data).encode('utf-8')#urlopen expects the POST body as bytes
    my_response=urllib.request.urlopen(url,postData)
    return my_response
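Cidan() stops short of actually parsing a reply. To complete the idea, the form data has to be URL-encoded into bytes and the response decoded; the field names below (i, doctype) are only illustrative assumptions about what such a form might contain, copied in spirit from a browser's network panel rather than verified against the current Youdao interface.
import json
import urllib.parse
import urllib.request

def Cidan2(word):
    # Hypothetical endpoint/fields: copy the real ones from the browser's network panel.
    url = 'http://fanyi.youdao.com/'
    form = {
        'i': word,          # assumed name of the field holding the text to translate
        'doctype': 'json',  # assumed flag asking for a JSON response
    }
    postData = urllib.parse.urlencode(form).encode('utf-8')  # POST body must be bytes
    with urllib.request.urlopen(url, postData) as response:
        raw = response.read().decode('utf-8')
    try:
        return json.loads(raw)   # parse if the server really returned JSON
    except json.JSONDecodeError:
        return raw               # otherwise just hand back the text

# print(Cidan2('python'))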
【心得】