python 网络爬虫
爬虫基本操作
requests:请求是否成功
积累程序-爬虫-requests:请求是否成功
import requests
response=requests.get("https://www.pku.edu.cn")
print(response.status_code)#用来检查请求是否正确响应,如果状态码是200,代表请求成功。
#状态码分类:1XX,请求已收到;2XX,请求成功;3XX,重定向,如305表示应使用代理访问;4XX,客户端错误,如403表示禁止访问;5XX,服务器错误,如503表示服务器不可用。
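在此基础上,还可以让请求在状态码异常时直接抛出错误,便于及时发现问题。下面是一个简单的补充示例(仍以北大主页为例,属于示意写法):
import requests
response=requests.get("https://www.pku.edu.cn",timeout=10)#设置超时,避免请求一直挂起
if response.status_code==200:
    print("请求成功")
else:
    response.raise_for_status()#状态码为4XX或5XX时抛出HTTPError异常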
将文件写入本地
import requests
webFile=requests.get("https://www.pku.edu.cn/about.html")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现
print(data)
with open(r"E:/myDownload.html","w",encoding="utf-8") as file1:#将requests获得的网络文件写入本地中。
    file1.write(data)
#【舉例】
import requests
a=1#起始卷号
urlList=[]
for i in range(a,a+3):#调试时可以先把范围缩小为range(a,a+1)
    webUrl="https://zh.m.wikisource.org/wiki/春秋左傳正義/卷"+str(i)
    urlList.append(webUrl)
    webFile=requests.get(webUrl)
    webFile.encoding="utf-8"
    data=webFile.text
    myDfile="myDownload"+str(i)+".html"
    #第一種:用with语句写入,自动关闭文件
    with open(myDfile,"w",encoding="utf-8") as file1:#将requests获得的网络文件写入本地
        file1.write(data)
    #第二種:手动open、write、close
    wFile=open(myDfile,"w",encoding="utf-8")
    wFile.write(data)
    wFile.close()
观察网站结构
在爬取过程中,需要观察网站的结构。
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
</body>
</html>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html,'html.parser')
#print(soup)
#print(type(soup))#BeautifulSoup
tag=soup.find('p')
#print(tag)#Tag
string1=tag.string
#print(string1)#NavigableString
soup2=BeautifulSoup("<b><!--Hey--></b>",'html.parser')
comment=soup2.b.string
#print(comment)
#print(type(comment))
soup3=BeautifulSoup('<ad id=123 class="red blue">Hey</ad>','html.parser')
tag=soup3.ad
##print(tag.name)#ad是自己定义的,命名的。
##print(tag.attrs)
##Tag对象类似于HTML文档的标签.
##
##对于标签来说,最重要的就是名字name和属性attrs.
#修改soup的信息
soup=BeautifulSoup('<p id=123 class="red blue">Hey</p>','html.parser')
tag=soup.p
tag.name='a'
tag.attrs['id']=456
tag.attrs['class'][0]='white'
#print(soup)
from bs4 import BeautifulSoup
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
##print(tag)
##string=tag.string
##print(string)
##print(type(string))
##
##print(string.split('e'))
##
##print(string.lower())
#NavigableString同样可以被直接修改,也可以使用replace_with的方法来修改.
from bs4 import BeautifulSoup
soup=BeautifulSoup('<p>Hey</p>','html.parser')
tag=soup.p
a='Hello'
tag.string=a
##print(soup)
##tag.string.replace_with('KO')
##print(soup)
html = """
<div>Total
<p class="story"> First_p
<a id="1">El</a>,
<a id="2">E2</a>,
<a id="3">E3</a>,
</p>
<p>Second_p</p>
</div>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html,'html.parser')
#print(soup)
tag=soup.p
#print(tag)
#首先,存在多个同名标签时,用soup.标签名的方式取到的永远是第一个该标签;要获取全部同名标签,应使用find_all.
##
##print(len(tag.contents))
##print(tag.contents)
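下面用一个简短示例验证这一点(示例中的html字符串是为演示另行构造的):
from bs4 import BeautifulSoup
html2="<div><p>First_p</p><p>Second_p</p></div>"
soup=BeautifulSoup(html2,'html.parser')
print(soup.p)#只取到第一个p标签:<p>First_p</p>
print(soup.find_all('p'))#返回所有p标签组成的列表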
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<div>
<p class="a title"><b>The Dormouse's story</b></p>
<p class="a story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
</div>
<div>
<p class="st">Last<p class="st">......</p></p>
</div>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
#标签就是HTML的标签.比如搜索html文档中的所有a标签.
#print(soup.find_all('a'))
#可以接受正则表达式作为过滤,比如所有名称中包含'a'的标签.
print()
import re
#print(soup.find_all(re.compile('a')))
##
##
##列表中所包含的元素都将作为过滤标准,比如搜索所有的a标签和b标签.
#print(soup.find_all(['a','b']))
#print()
#print(soup.find_all('p')[1].find_all(True))
##输出
##[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
## <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
## <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
def filter(tag):
    return tag['id']=='link2'
#print(soup.find_all('p')[1])
##print(soup.find_all('p')[1].find_all(filter))
##
##print(soup.find_all('p')[1].find_all(filter))
#输出:
#[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print(soup.select('.st'))
#HTML解析器会自动补全不完整或不规范的标签,BeautifulSoup再提供prettify()方法,把解析结果按层级缩进输出,便于阅读.
把保存文件的扩展名改成 .htm 或者 .txt,就可以改变文件被打开的方式:.htm 会被浏览器当作网页渲染,.txt 则按纯文本显示。
如何创建htm文件?把HTML源码写入一个以 .htm 结尾的文件即可,用浏览器就能打开,相当于一个本地网页。
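下面是一个最小的示意(文件名myPage.htm为举例,可自行替换):
import os
import webbrowser
html='<html><head><meta charset="utf-8"><title>测试页</title></head><body><p>你好,网页</p></body></html>'
with open("myPage.htm","w",encoding="utf-8") as f:#把HTML源码写入.htm文件
    f.write(html)
webbrowser.open("file://"+os.path.abspath("myPage.htm"))#用默认浏览器打开这个本地网页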
电脑信息字段(请求头headers)
有时候,需要在请求里附带浏览器、操作系统等电脑信息,也就是请求头headers(核心是user-agent),以及查询参数params,否则部分网站会拒绝响应。
import requests
import csv
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
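这两个字典通过requests.get的headers和params参数传入(下面的网址仅作示例,实际使用时换成目标页面):
url="https://www.zhihu.com/follow"
webFile=requests.get(url,params=params,headers=headers)#params会被拼接到URL的查询字符串中
print(webFile.status_code)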
BeautifulSoup的使用
在交互环境中执行help(BeautifulSoup),可以查看它的完整接口说明,输出如下,可作速查:
Help on class BeautifulSoup in module bs4:
class BeautifulSoup(bs4.element.Tag)
| BeautifulSoup(markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
|
| A data structure representing a parsed HTML or XML document.
|
| Most of the methods you'll call on a BeautifulSoup object are inherited from
| PageElement or Tag.
|
| Internally, this class defines the basic interface called by the
| tree builders when converting an HTML/XML document into a data
| structure. The interface abstracts away the differences between
| parsers. To write a new tree builder, you'll need to understand
| these methods as a whole.
|
| These methods will be called by the BeautifulSoup constructor:
| * reset()
| * feed(markup)
|
| The tree builder may call these methods from its feed() implementation:
| * handle_starttag(name, attrs) # See note about return value
| * handle_endtag(name)
| * handle_data(data) # Appends to the current data node
| * endData(containerClass) # Ends the current data node
|
| No matter how complicated the underlying parser is, you should be
| able to build a tree using 'start tag' events, 'end tag' events,
| 'data' events, and "done with data" events.
|
| If you encounter an empty-element tag (aka a self-closing tag,
| like HTML's <br> tag), call handle_starttag and then
| handle_endtag.
|
| Method resolution order:
| BeautifulSoup
| bs4.element.Tag
| bs4.element.PageElement
| builtins.object
|
| Methods defined here:
|
| __copy__(self)
| Copy a BeautifulSoup object by converting the document to a string and parsing it again.
|
| __getstate__(self)
|
| __init__(self, markup='', features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs)
| Constructor.
|
| :param markup: A string or a file-like object representing
| markup to be parsed.
|
| :param features: Desirable features of the parser to be
| used. This may be the name of a specific parser ("lxml",
| "lxml-xml", "html.parser", or "html5lib") or it may be the
| type of markup to be used ("html", "html5", "xml"). It's
| recommended that you name a specific parser, so that
| Beautiful Soup gives you the same results across platforms
| and virtual environments.
|
| :param builder: A TreeBuilder subclass to instantiate (or
| instance to use) instead of looking one up based on
| `features`. You only need to use this if you've implemented a
| custom TreeBuilder.
|
| :param parse_only: A SoupStrainer. Only parts of the document
| matching the SoupStrainer will be considered. This is useful
| when parsing part of a document that would otherwise be too
| large to fit into memory.
|
| :param from_encoding: A string indicating the encoding of the
| document to be parsed. Pass this in if Beautiful Soup is
| guessing wrongly about the document's encoding.
|
| :param exclude_encodings: A list of strings indicating
| encodings known to be wrong. Pass this in if you don't know
| the document's encoding but you know Beautiful Soup's guess is
| wrong.
|
| :param element_classes: A dictionary mapping BeautifulSoup
| classes like Tag and NavigableString, to other classes you'd
| like to be instantiated instead as the parse tree is
| built. This is useful for subclassing Tag or NavigableString
| to modify default behavior.
|
| :param kwargs: For backwards compatibility purposes, the
| constructor accepts certain keyword arguments used in
| Beautiful Soup 3. None of these arguments do anything in
| Beautiful Soup 4; they will result in a warning and then be
| ignored.
|
| Apart from this, any keyword arguments passed into the
| BeautifulSoup constructor are propagated to the TreeBuilder
| constructor. This makes it possible to configure a
| TreeBuilder by passing in arguments, not just by saying which
| one to use.
|
| decode(self, pretty_print=False, eventual_encoding='utf-8', formatter='minimal')
| Returns a string or Unicode representation of the parse tree
| as an HTML or XML document.
|
| :param pretty_print: If this is True, indentation will be used to
| make the document more readable.
| :param eventual_encoding: The encoding of the final document.
| If this is None, the document will be a Unicode string.
|
| endData(self, containerClass=None)
| Method called by the TreeBuilder when the end of a data segment
| occurs.
|
| handle_data(self, data)
| Called by the tree builder when a chunk of textual data is encountered.
|
| handle_endtag(self, name, nsprefix=None)
| Called by the tree builder when an ending tag is encountered.
|
| :param name: Name of the tag.
| :param nsprefix: Namespace prefix for the tag.
|
| handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, sourcepos=None)
| Called by the tree builder when a new tag is encountered.
|
| :param name: Name of the tag.
| :param nsprefix: Namespace prefix for the tag.
| :param attrs: A dictionary of attribute values.
| :param sourceline: The line number where this tag was found in its
| source document.
| :param sourcepos: The character position within `sourceline` where this
| tag was found.
|
| If this method returns None, the tag was rejected by an active
| SoupStrainer. You should proceed as if the tag had not occurred
| in the document. For instance, if this was a self-closing tag,
| don't call handle_endtag.
|
| insert_after(self, successor)
| This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
| it because there is nothing before or after it in the parse tree.
|
| insert_before(self, successor)
| This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
| it because there is nothing before or after it in the parse tree.
|
| new_string(self, s, subclass=None)
| Create a new NavigableString associated with this BeautifulSoup
| object.
|
| new_tag(self, name, namespace=None, nsprefix=None, attrs={}, sourceline=None, sourcepos=None, **kwattrs)
| Create a new Tag associated with this BeautifulSoup object.
|
| object_was_parsed(self, o, parent=None, most_recent_element=None)
| Method called by the TreeBuilder to integrate an object into the parse tree.
|
| popTag(self)
| Internal method called by _popToTag when a tag is closed.
|
| pushTag(self, tag)
| Internal method called by handle_starttag when a tag is opened.
|
| reset(self)
| Reset this object to a state as though it had never parsed any
| markup.
|
| ----------------------------------------------------------------------
| Data and other attributes defined here:
|
| ASCII_SPACES = ' \n\t\x0c\r'
|
| DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
| NO_PARSER_SPECIFIED_WARNING = 'No parser was explicitly specified, so ...
|
| ROOT_TAG_NAME = '[document]'
|
| ----------------------------------------------------------------------
| Methods inherited from bs4.element.Tag:
|
| __bool__(self)
| A tag is non-None even if it has no contents.
|
| __call__(self, *args, **kwargs)
| Calling a Tag like a function is the same as calling its
| find_all() method. Eg. tag('a') returns a list of all the A tags
| found within this tag.
|
| __contains__(self, x)
|
| __delitem__(self, key)
| Deleting tag[key] deletes all 'key' attributes for the tag.
|
| __eq__(self, other)
| Returns true iff this Tag has the same name, the same attributes,
| and the same contents (recursively) as `other`.
|
| __getattr__(self, tag)
| Calling tag.subtag is the same as calling tag.find(name="subtag")
|
| __getitem__(self, key)
| tag[key] returns the value of the 'key' attribute for the Tag,
| and throws an exception if it's not there.
|
| __hash__(self)
| Return hash(self).
|
| __iter__(self)
| Iterating over a Tag iterates over its contents.
|
| __len__(self)
| The length of a Tag is the length of its list of contents.
|
| __ne__(self, other)
| Returns true iff this Tag is not identical to `other`,
| as defined in __eq__.
|
| __repr__ = __unicode__(self)
|
| __setitem__(self, key, value)
| Setting tag[key] sets the value of the 'key' attribute for the
| tag.
|
| __str__ = __unicode__(self)
|
| __unicode__(self)
| Renders this PageElement as a Unicode string.
|
| childGenerator(self)
| Deprecated generator.
|
| clear(self, decompose=False)
| Wipe out all children of this PageElement by calling extract()
| on them.
|
| :param decompose: If this is True, decompose() (a more
| destructive method) will be called instead of extract().
|
| decode_contents(self, indent_level=None, eventual_encoding='utf-8', formatter='minimal')
| Renders the contents of this tag as a Unicode string.
|
| :param indent_level: Each line of the rendering will be
| indented this many spaces. Used internally in
| recursive calls while pretty-printing.
|
| :param eventual_encoding: The tag is destined to be
| encoded into this encoding. decode_contents() is _not_
| responsible for performing that encoding. This information
| is passed in so that it can be substituted in if the
| document contains a <META> tag that mentions the document's
| encoding.
|
| :param formatter: A Formatter object, or a string naming one of
| the standard Formatters.
|
| decompose(self)
| Recursively destroys this PageElement and its children.
|
| This element will be removed from the tree and wiped out; so
| will everything beneath it.
|
| encode(self, encoding='utf-8', indent_level=None, formatter='minimal', errors='xmlcharrefreplace')
| Render a bytestring representation of this PageElement and its
| contents.
|
| :param encoding: The destination encoding.
| :param indent_level: Each line of the rendering will be
| indented this many spaces. Used internally in
| recursive calls while pretty-printing.
| :param formatter: A Formatter object, or a string naming one of
| the standard formatters.
| :param errors: An error handling strategy such as
| 'xmlcharrefreplace'. This value is passed along into
| encode() and its value should be one of the constants
| defined by Python.
| :return: A bytestring.
|
| encode_contents(self, indent_level=None, encoding='utf-8', formatter='minimal')
| Renders the contents of this PageElement as a bytestring.
|
| :param indent_level: Each line of the rendering will be
| indented this many spaces. Used internally in
| recursive calls while pretty-printing.
|
| :param eventual_encoding: The bytestring will be in this encoding.
|
| :param formatter: A Formatter object, or a string naming one of
| the standard Formatters.
|
| :return: A bytestring.
|
| find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
| Look in the children of this PageElement and find the first
| PageElement that matches the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param recursive: If this is True, find() will perform a
| recursive search of this PageElement's children. Otherwise,
| only the direct children will be considered.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| findAll = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
|
| findChild = find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
|
| findChildren = find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
|
| find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
| Look in the children of this PageElement and find all
| PageElements that match the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param recursive: If this is True, find_all() will perform a
| recursive search of this PageElement's children. Otherwise,
| only the direct children will be considered.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| get(self, key, default=None)
| Returns the value of the 'key' attribute for the tag, or
| the value given for 'default' if it doesn't have that
| attribute.
|
|
| get_attribute_list(self, key, default=None)
| The same as get(), but always returns a list.
|
| :param key: The attribute to look for.
| :param default: Use this value if the attribute is not present
| on this PageElement.
| :return: A list of values, probably containing only a single
| value.
|
| get_text(self, separator='', strip=False, types=(<class 'bs4.element.NavigableString'>, <class 'bs4.element.CData'>))
| Get all child strings, concatenated using the given separator.
|
| :param separator: Strings will be concatenated using this separator.
|
| :param strip: If True, strings will be stripped before being
| concatenated.
|
| :types: A tuple of NavigableString subclasses. Any strings of
| a subclass not found in this list will be ignored. By
| default, this means only NavigableString and CData objects
| will be considered. So no comments, processing instructions,
| etc.
|
| :return: A string.
|
| has_attr(self, key)
| Does this PageElement have an attribute with the given name?
|
| has_key(self, key)
| Deprecated method. This was kind of misleading because has_key()
| (attributes) was different from __in__ (contents).
|
| has_key() is gone in Python 3, anyway.
|
| index(self, element)
| Find the index of a child by identity, not value.
|
| Avoids issues with tag.contents.index(element) getting the
| index of equal elements.
|
| :param element: Look for this PageElement in `self.contents`.
|
| prettify(self, encoding=None, formatter='minimal')
| Pretty-print this PageElement as a string.
|
| :param encoding: The eventual encoding of the string. If this is None,
| a Unicode string will be returned.
| :param formatter: A Formatter object, or a string naming one of
| the standard formatters.
| :return: A Unicode string (if encoding==None) or a bytestring
| (otherwise).
|
| recursiveChildGenerator(self)
| Deprecated generator.
|
| renderContents(self, encoding='utf-8', prettyPrint=False, indentLevel=0)
| Deprecated method for BS3 compatibility.
|
| select(self, selector, namespaces=None, limit=None, **kwargs)
| Perform a CSS selection operation on the current element.
|
| This uses the SoupSieve library.
|
| :param selector: A string containing a CSS selector.
|
| :param namespaces: A dictionary mapping namespace prefixes
| used in the CSS selector to namespace URIs. By default,
| Beautiful Soup will use the prefixes it encountered while
| parsing the document.
|
| :param limit: After finding this number of results, stop looking.
|
| :param kwargs: Keyword arguments to be passed into SoupSieve's
| soupsieve.select() method.
|
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| select_one(self, selector, namespaces=None, **kwargs)
| Perform a CSS selection operation on the current element.
|
| :param selector: A CSS selector.
|
| :param namespaces: A dictionary mapping namespace prefixes
| used in the CSS selector to namespace URIs. By default,
| Beautiful Soup will use the prefixes it encountered while
| parsing the document.
|
| :param kwargs: Keyword arguments to be passed into SoupSieve's
| soupsieve.select() method.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| smooth(self)
| Smooth out this element's children by consolidating consecutive
| strings.
|
| This makes pretty-printed output look more natural following a
| lot of operations that modified the tree.
|
| ----------------------------------------------------------------------
| Readonly properties inherited from bs4.element.Tag:
|
| children
| Iterate over all direct children of this PageElement.
|
| :yield: A sequence of PageElements.
|
| descendants
| Iterate over all children of this PageElement in a
| breadth-first sequence.
|
| :yield: A sequence of PageElements.
|
| isSelfClosing
| Is this tag an empty-element tag? (aka a self-closing tag)
|
| A tag that has contents is never an empty-element tag.
|
| A tag that has no contents may or may not be an empty-element
| tag. It depends on the builder used to create the tag. If the
| builder has a designated list of empty-element tags, then only
| a tag whose name shows up in that list is considered an
| empty-element tag.
|
| If the builder has no designated list of empty-element tags,
| then any tag with no contents is an empty-element tag.
|
| is_empty_element
| Is this tag an empty-element tag? (aka a self-closing tag)
|
| A tag that has contents is never an empty-element tag.
|
| A tag that has no contents may or may not be an empty-element
| tag. It depends on the builder used to create the tag. If the
| builder has a designated list of empty-element tags, then only
| a tag whose name shows up in that list is considered an
| empty-element tag.
|
| If the builder has no designated list of empty-element tags,
| then any tag with no contents is an empty-element tag.
|
| strings
| Yield all strings of certain classes, possibly stripping them.
|
| :param strip: If True, all strings will be stripped before being
| yielded.
|
| :types: A tuple of NavigableString subclasses. Any strings of
| a subclass not found in this list will be ignored. By
| default, this means only NavigableString and CData objects
| will be considered. So no comments, processing instructions,
| etc.
|
| :yield: A sequence of strings.
|
| stripped_strings
| Yield all strings in the document, stripping them first.
|
| :yield: A sequence of stripped strings.
|
| text
| Get all child strings, concatenated using the given separator.
|
| :param separator: Strings will be concatenated using this separator.
|
| :param strip: If True, strings will be stripped before being
| concatenated.
|
| :types: A tuple of NavigableString subclasses. Any strings of
| a subclass not found in this list will be ignored. By
| default, this means only NavigableString and CData objects
| will be considered. So no comments, processing instructions,
| etc.
|
| :return: A string.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from bs4.element.Tag:
|
| parserClass
|
| string
| Convenience property to get the single string within this
| PageElement.
|
| TODO It might make sense to have NavigableString.string return
| itself.
|
| :return: If this element has a single string child, return
| value is that string. If this element has one child tag,
| return value is the 'string' attribute of the child tag,
| recursively. If this element is itself a string, has no
| children, or has more than one child, return value is None.
|
| ----------------------------------------------------------------------
| Methods inherited from bs4.element.PageElement:
|
| append(self, tag)
| Appends the given PageElement to the contents of this one.
|
| :param tag: A PageElement.
|
| extend(self, tags)
| Appends the given PageElements to this one's contents.
|
| :param tags: A list of PageElements.
|
| extract(self)
| Destructively rips this element out of the tree.
|
| :return: `self`, no longer part of the tree.
|
| fetchNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| fetchParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
|
| fetchPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| fetchPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findAllNext = find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findAllPrevious = find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findNext = find_next(self, name=None, attrs={}, text=None, **kwargs)
|
| findNextSibling = find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
|
| findNextSiblings = find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| findParent = find_parent(self, name=None, attrs={}, **kwargs)
|
| findParents = find_parents(self, name=None, attrs={}, limit=None, **kwargs)
|
| findPrevious = find_previous(self, name=None, attrs={}, text=None, **kwargs)
|
| findPreviousSibling = find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
|
| findPreviousSiblings = find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
|
| find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Find all PageElements that match the given criteria and appear
| later in the document than this PageElement.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet containing PageElements.
|
| find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Look backwards in the document from this PageElement and find all
| PageElements that match the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| find_next(self, name=None, attrs={}, text=None, **kwargs)
| Find the first PageElement that matches the given criteria and
| appears later in the document than this PageElement.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_next_sibling(self, name=None, attrs={}, text=None, **kwargs)
| Find the closest sibling to this PageElement that matches the
| given criteria and appears later in the document.
|
| All find_* methods take a common set of arguments. See the
| online documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Find all siblings of this PageElement that match the given criteria
| and appear later in the document.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| find_parent(self, name=None, attrs={}, **kwargs)
| Find the closest parent of this PageElement that matches the given
| criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :kwargs: A dictionary of filters on attribute values.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_parents(self, name=None, attrs={}, limit=None, **kwargs)
| Find all parents of this PageElement that match the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_previous(self, name=None, attrs={}, text=None, **kwargs)
| Look backwards in the document from this PageElement and find the
| first PageElement that matches the given criteria.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs)
| Returns the closest sibling to this PageElement that matches the
| given criteria and appears earlier in the document.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :kwargs: A dictionary of filters on attribute values.
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
| Returns all siblings to this PageElement that match the
| given criteria and appear earlier in the document.
|
| All find_* methods take a common set of arguments. See the online
| documentation for detailed explanations.
|
| :param name: A filter on tag name.
| :param attrs: A dictionary of filters on attribute values.
| :param text: A filter for a NavigableString with specific text.
| :param limit: Stop looking after finding this many results.
| :kwargs: A dictionary of filters on attribute values.
| :return: A ResultSet of PageElements.
| :rtype: bs4.element.ResultSet
|
| format_string(self, s, formatter)
| Format the given string using the given formatter.
|
| :param s: A string.
| :param formatter: A Formatter object, or a string naming one of the standard formatters.
|
| formatter_for_name(self, formatter)
| Look up or create a Formatter for the given identifier,
| if necessary.
|
| :param formatter: Can be a Formatter object (used as-is), a
| function (used as the entity substitution hook for an
| XMLFormatter or HTMLFormatter), or a string (used to look
| up an XMLFormatter or HTMLFormatter in the appropriate
| registry.
|
| insert(self, position, new_child)
| Insert a new PageElement in the list of this PageElement's children.
|
| This works the same way as `list.insert`.
|
| :param position: The numeric position that should be occupied
| in `self.children` by the new PageElement.
| :param new_child: A PageElement.
|
| nextGenerator(self)
| # Old non-property versions of the generators, for backwards
| # compatibility with BS3.
|
| nextSiblingGenerator(self)
|
| parentGenerator(self)
|
| previousGenerator(self)
|
| previousSiblingGenerator(self)
|
| replaceWith = replace_with(self, replace_with)
|
| replaceWithChildren = unwrap(self)
|
| replace_with(self, replace_with)
| Replace this PageElement with another one, keeping the rest of the
| tree the same.
|
| :param replace_with: A PageElement.
| :return: `self`, no longer part of the tree.
|
| replace_with_children = unwrap(self)
|
| setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None)
| Sets up the initial relations between this element and
| other elements.
|
| :param parent: The parent of this element.
|
| :param previous_element: The element parsed immediately before
| this one.
|
| :param next_element: The element parsed immediately before
| this one.
|
| :param previous_sibling: The most recently encountered element
| on the same level of the parse tree as this one.
|
| :param previous_sibling: The next element to be encountered
| on the same level of the parse tree as this one.
|
| unwrap(self)
| Replace this PageElement with its contents.
|
| :return: `self`, no longer part of the tree.
|
| wrap(self, wrap_inside)
| Wrap this PageElement inside another one.
|
| :param wrap_inside: A PageElement.
| :return: `wrap_inside`, occupying the position in the tree that used
| to be occupied by `self`, and with `self` inside it.
|
| ----------------------------------------------------------------------
| Readonly properties inherited from bs4.element.PageElement:
|
| next
| The PageElement, if any, that was parsed just after this one.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| next_elements
| All PageElements that were parsed after this one.
|
| :yield: A sequence of PageElements.
|
| next_siblings
| All PageElements that are siblings of this one but were parsed
| later.
|
| :yield: A sequence of PageElements.
|
| parents
| All PageElements that are parents of this PageElement.
|
| :yield: A sequence of PageElements.
|
| previous
| The PageElement, if any, that was parsed just before this one.
|
| :return: A PageElement.
| :rtype: bs4.element.PageElement
|
| previous_elements
| All PageElements that were parsed before this one.
|
| :yield: A sequence of PageElements.
|
| previous_siblings
| All PageElements that are siblings of this one but were parsed
| earlier.
|
| :yield: A sequence of PageElements.
|
| ----------------------------------------------------------------------
| Data descriptors inherited from bs4.element.PageElement:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
|
| nextSibling
|
| previousSibling
BeautifulSoup-find_all用法
积累程序-爬虫-BeautifulSoup-find_all用法
import requests
from bs4 import BeautifulSoup as bs
webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串
soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
##
##items=soup.find_all(class_="h")
##for i in items:
## print(i)
items2=soup.find_all(class_="item")
for iTag in items2:
    for i in iTag.find_all():
        print(i)
requests与BeautifulSoup的区别
import requests
file1=requests.get("https://www.pku.edu.cn")
file1.encoding="utf-8"
data=file1.text
myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)
myFile.close()
'''
soup的数据类型是<class 'bs4.BeautifulSoup'>,说明soup是一个BeautifulSoup对象
打印的soup,是所请求网页的完整HTML源代码
虽然response.text和soup打印出来的内容表面上一模一样,但它们属于不同的类:<class 'str'> 与 <class 'bs4.BeautifulSoup'>。前者是字符串,后者是已经被解析过的BeautifulSoup对象。打印结果相同,是因为直接打印BeautifulSoup对象时会调用它的__str__方法,返回的正是文档的字符串形式。
'''
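可以用如下几行验证两者的类型差异(接着上面的代码,soup需要先从data解析出来):
from bs4 import BeautifulSoup
soup=BeautifulSoup(data,"html.parser")
print(type(data))#<class 'str'>
print(type(soup))#<class 'bs4.BeautifulSoup'>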
获取网页中文字并有序展现
爬虫-bs-获取北大网页中的网站和文字-并有序展现
import requests
from bs4 import BeautifulSoup as bs
webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串
soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
items2=soup.find_all(class_="item")
##for iTag in items2:
## for i in iTag.find_all():
## myText=i.get_text()
## print(myText)
##
for everyTag in items2:
    print(everyTag)
    print()
    print("文字部分")
    myText=everyTag.get_text()
    print(myText)
    print()
    print("链接部分")
    myLinks=everyTag.find_all("a")#everyLink是BS中的tag
    for everyLink in myLinks:
        if "href" in everyLink.attrs:#attrs只有在BS的tag中才可以用。
            print(everyLink)
input()
global
变量作用域
一个在函数内部赋值的变量仅能在该函数内部使用(局部作用域),它们被称作局部变量
在所有函数之外赋值的变量,可以在程序的任何位置使用(全局作用域),它们被称作全局变量
如果想将局部变量声明为全局变量,就要用到global语句
tfc = 1000
def tvc():
    global tvc # global语句一般写在函数体的第一行,它会告诉Python,“我希望tvc是个全局变量,所以请不要用这个名字创建一个局部变量”
    vc = 200
    x = 10
    tvc = vc * x
def tc():
    print(tfc+tvc) # tc()函数内部现在可以直接使用声明后的全局变量tvc
tvc()
tc()
# 》》3000
match
import re
m=re.match("hello","hellov world")
if m is not None:
    print(m.group())
    print(m.__class__.__name__)
m=re.match("bird","bird is flying")
print(m.group())
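需要注意,re.match只从字符串开头开始匹配,如果模式出现在字符串中间,match会返回None,此时应改用re.search。下面是一个简单对比:
import re
print(re.match("world","hello world"))#None,因为字符串开头不是world
print(re.search("world","hello world").group())#world,search会在整个字符串中查找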
使用soup.prettify() 有序呈现
import requests
import csv
from bs4 import BeautifulSoup as bs
url="https://www.zhihu.com/follow"
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
webFile= requests.get(url, params=params, headers=headers)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
print(soup.prettify())
爬取标签
从网页中爬取标签
从超星、维基、知网、阿帕比网站,Ctrl + S 保存网页后,爬取其中的文本目录信息。可以用如下代码实现操作。
myWord="""
[Images]
[Font]
Language=GBK
FontSize=7
Margin=0.5
[Bkmk]
File=FreePic2Pdf_bkmk.txt
AddAsText=0
ShowBkmk=1
ShowAll=1
BasePage=1
[Main]
ContentsPage=
TextPage=
"""
Head='''
首
\t书名页
\t版权页
\t序言
目录
'''
def test():
htmlName=str(input("请输入网页Wiki CNKI ChoaXing Apabi文件名称:"))
import requests
from bs4 import BeautifulSoup as bs
webFile=open(htmlName,"r",encoding="utf-8")
data=webFile.read()
webFile.close()
mysoup=bs(data,"html.parser")
mysoup.prettify()
writeFile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
print(Head,file=writeFile)
if "维基文库" in htmlName:
print("Wiki")
result=mysoup.find_all("li")
choice=input("请选择通行A 或 调试T:")
for i in result:
myInfo=i.get_text()
if choice=="A":
if "卷" in myInfo:
mylist=myInfo.split(" ")
print(mylist[0],file=writeFile)
for m in mylist[1:]:
print("\t",m,file=writeFile)
elif choice=="T":
if "卷" in myInfo:
print(myInfo,file=writeFile)
else:
print("\t",myInfo,file=writeFile)
elif "阿帕比" in htmlName:
print("Apabi")
result=mysoup.find_all("li")
for i in result:
myInfo=i.get_text()
for word in "()1234567890页":
myInfo=myInfo.replace(word,"")
infoList=myInfo.split(" ")
if len(infoList)>2:#将单个的对象排除。统一切分处理
print(infoList[1],file=writeFile)
for m in infoList[2:]:
print("\t",m,file=writeFile)
elif len(infoList)==2:
print("\t",myInfo,file=writeFile)
elif "中国知网" in htmlName or "CNKI" in htmlName:
print("CNKI")
result=mysoup.find_all(attrs={"class":"catalog-listDiv"})
if len(result)==0:
result=mysoup.find_all("li")
for i in result:
myInfo=i.get_text()
infoline=myInfo.split(" ")
for line in infoline:
if "摘要" in line:
nline=line.split(" ")
for m in nline:
print(m,file=writeFile)
elif "第" in line and "章" in line and "节" not in line:
wline=line.split(" ")
print("\t",wline[0],file=writeFile)
for m in wline[1:]:
print(m,end="",file=writeFile)
print("\n",file=writeFile)
elif "结语 参考文献 致谢" in line:
nline=line.split(" ")
print(nline[0]+nline[1],file=writeFile)
for m in nline[2:]:
print(m,file=writeFile)
else:print("\t",line,file=writeFile)
else:
print("ChaoXing")
result=mysoup.find_all("span")
for i in result:
if "node_name" in str(i):
sen=i.get_text()
sen=sen.lstrip(" ")
if "第" in str(i) and "章" in str(i):
print(sen,file=writeFile)
elif "第" in str(i) and "讲" in str(i):
print(sen,file=writeFile)
elif "卷" in str(i) or "论" in str(i) or "编" in str(i):
for hz in "一二三四五六七八九十":
if hz in str(i):
print(sen,file=writeFile)
break
else:print("\t",sen,file=writeFile)
else:
print("\t",sen,file=writeFile)
print("尾",file=writeFile)
writeFile.close()
itfFile=open("FreePic2Pdf.itf","w",encoding="utf-8")
print(myWord,file=itfFile)
itfFile.close()
即可完成。
从文本中获取标签
可以使用如下代码:
def test():
import re
pattern="“.*?[。?:;”]"
fileName=input("选择句子开头作为标签,请输入文本名称:")#说文解字,尔雅
part=input("请输入1或2个区分层级关键词{第部章卷...}:")
if len(part)==1:
a=part
b=part
elif len(part)==2:
a=part[0]
b=part[1]
choice="L"
choice=input("文本对话选L;Wiki目录选W;开头首字母选S;开头前面句子选E:")
choice=choice.upper()
file=open(fileName,"r",encoding="utf-8")
data=file.read()
file.close()
data=data.replace("编辑","")
datalines=data.splitlines()
def ShuoWen():
#说文
for line in datalines:
for word in line:
if word in "(( )0123456789:↑":
break
print("\t",word,file=wfile)
def ErYa():
for line in datalines:
if part in line:
print(line,file=wfile)
else:print("\t",line[:5],file=wfile)
def Wiki():
for line in datalines:
if part in line and len(line)<=4 and len(line)>=2:
print(line,file=wfile)
elif "↑" in line or "◄" in line or "►" in line or " 註釋" in line:pass
elif len(line)>=2 and len(line)<=10:
print("\t",line,file=wfile)
def LunYu():
zhang=0
jieming=0
for line in datalines:
if a in line and b in line:
print(line,file=wfile)
zhang+=1
jieming=1
if a not in line and b not in line and len(line)>4:#【经验】if ...if...和if ... else...不同。前者是单线,后者是双线。
result=re.compile(pattern).findall(line)
print("\t",f"{zhang}.{jieming}",end="",file=wfile)
if len(result)!=0:#选择引号内的句子。
jieming+=1
n=0
for i in result:
i=i.lstrip("“")
print(i,file=wfile)
n+=1
if n==1:
break
else:#没有引号则选择开头句子
jieming+=1
for w in line:
print(w,end="",file=wfile)
if w in ":。;":
break
print("\n",file=wfile)
wfile=open("FreePic2Pdf_bkmk.txt","w",encoding="utf-8")
if choice=="S":
ShuoWen()
elif choice=="E":
ErYa()
elif choice=="W":
Wiki()
elif choice=="L":
LunYu()
wfile.close()
print("已经完成")
即可实现。
爬取文本
爬虫实践从wiki中下载文本
def test():
import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re
webUrl=input("请输入书籍所在的维基网址:")
infoList=webUrl.split("/")
articleName=infoList[-1]
startTime=time.time()
writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
obs=bs(data,"html.parser")
obs.prettify()
resultLink=obs.find_all("li")
webList=[]
for link in resultLink:
if articleName in str(link):
iname=link.get_text()
iweb=webUrl+"/"+iname
webList.append(iweb)
for iweb in webList:
print(iweb)
iFile=requests.get(iweb)
iFile.encoding="utf-8"
idata=iFile.text
iobs=bs(idata,"html.parser")
iobs.prettify()
result0=iobs.find_all(attrs={"class":"section-heading"})
## result1=iobs.find_all("section")
## print(result1)
result1=iobs.find_all(attrs={"class":"mw-parser-output"})
## for i in result1:
## print(i.get_text(),file=writeFile)
##
if len(result0)!=0:
result1.pop(0)#如果开头标题有多余信息,则使用这个软件
xy=zip(result0,result1)
for i in xy:
print(i[0].get_text()[:-2],file=writeFile)#下载《春秋左传正义》的时候用了这个程序
print(i[1].get_text(),file=writeFile)
else:
for i in result1:
print(i.get_text(),file=writeFile)#下载《史记三家注》用了这个程序
time.sleep(0.05+random.randint(0,2))
writeFile.close()
endTime=time.time()
long=(endTime-startTime)/60
print("总记时:","{0:4.2}".format(long),"分钟。")
这段代码还可以进一步完善,主要是补充注释,提醒各章节链接的拼接格式需要按目标网站的实际情况调整:
def test():
import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re
webUrl=input("请输入书籍所在的维基网址:")
infoList=webUrl.split("/")
articleName=infoList[-1]
startTime=time.time()
writeFile=open(f"{articleName}.txt","a",encoding="utf-8")
webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
obs=bs(data,"html.parser")
obs.prettify()
resultLink=obs.find_all("li")
webList=[]#需要依据实际情况调整章节的网络链接格式
for link in resultLink:
if articleName in str(link):
iname=link.get_text()
iweb=webUrl+"/"+iname
webList.append(iweb)#有的网站是“卷01”,不按照链接体现的格式。这个就得调整程序了。
for iweb in webList:
print(iweb)
iFile=requests.get(iweb)
iFile.encoding="utf-8"
idata=iFile.text
iobs=bs(idata,"html.parser")
iobs.prettify()
result0=iobs.find_all(attrs={"class":"section-heading"})
## result1=iobs.find_all("section")
## print(result1)
result1=iobs.find_all(attrs={"class":"mw-parser-output"})
## for i in result1:
## print(i.get_text(),file=writeFile)
##
if len(result0)!=0:
result1.pop(0)#如果开头标题有多余信息,则使用这个软件
xy=zip(result0,result1)
for i in xy:
print(i[0].get_text()[:-2],file=writeFile)#下载《春秋左传正义》的时候用了这个程序
print(i[1].get_text(),file=writeFile)
else:
for i in result1:
print(i.get_text(),file=writeFile)#下载《史记三家注》用了这个程序
time.sleep(0.05+random.randint(0,2))
writeFile.close()
endTime=time.time()
long=(endTime-startTime)/60
print("总记时:","{0:4.2}".format(long),"分钟。")
test()
爬虫实践从zdic中下载文本
import requests
from bs4 import BeautifulSoup as bs
import time
import random
import re
def test():
    a=int(input("请输入汉典网页起始页码:"))
    b=int(input("请输入汉典网页终止页码:"))
    myName=input("请输入目标文件名:")
    startTime=time.time()
    HouZhui=".docx"
    resultName=myName+HouZhui
    urlList=[]
    for i in range(a,b+1):
        webUrl="https://gj.zdic.net/archive.php?aid-"+str(i)+".html"
        urlList.append(webUrl)
    zongShu=len(urlList)
    n=1
    writeFile=open(resultName,"w",encoding="utf-8")
    for webUrl in urlList:
        webfile=requests.get(webUrl)
        webfile.encoding="utf-8"
        data=webfile.text
        obs=bs(data,"html.parser")
        obs.prettify()
        title=obs.title
        for i in title:
            print("\n",file=writeFile)
            print(i,file=writeFile)
        print("★",file=writeFile)
        result=obs.find_all(attrs={"id":"snr2"})
        art=str(result)
        artlines=art.splitlines()
        article=artlines[0][17:]
        article=article.replace("<br/>","s")
        for i in article:
            if i=="s":
                print("\n",file=writeFile)
                print("\t",file=writeFile)
            else:print(i,end="",sep="",file=writeFile)
        print("……",file=writeFile)
        print("\n",file=writeFile)
        time.sleep(0.05+random.randint(0,2))
        percent=float(n/zongShu)
        print(f"第{n}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))
        n+=1
    writeFile.close()
    endTime=time.time()
    long=(endTime-startTime)/60
    print("总记时:","{0:4.2}".format(long),"分钟。")
爬虫实践 从ctext 中下载文本
从ctext(中国哲学书电子化计划,ctext.org)中下载文本,可以用到官方提供的ctext包。
https://pypi.org/project/ctext/
下面以《论语》为例,说明如何下载。
代码如下:
from ctext import *
setapikey("your-api-key-goes-here")
setlanguage("zh")
stats = getstats()
status = getstatus()
titles = gettexttitles()
capabilities = getcapabilities()
urn = readlink("https://ctext.org/analects")#以论语为例
passages = gettext("ctp:analects/xue-er")
print(passages)
又有如下程序,亦可以实现功能。
def test():
'''
https://ctext.org/wiki.pl?if=gb&chapter=868712
https://ctext.org/wiki.pl?if=gb&chapter=969206
webUrl="https://ctext.org/wiki.pl?if=gb&res=970278
'''
import requests
from bs4 import BeautifulSoup as bs
import time
import random
headers={}#建立字典
user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]
startTime=time.time()
webUrl=input("请输入书本在Ctext中目录所在网址:")#目录页所在编码,可以获得每章的链接
##webUrl="https://ctext.org/wiki.pl?if=gb&res=642006"
startPage=int(input("请输入目录列表中所求链接的序列数字:"))
webfile=requests.get(webUrl)
webfile.encoding="utf-8"
data=webfile.text
obs=bs(data,"html.parser")
obs.prettify()
result=obs.find_all("a")
Name=obs.h2
nameStr=Name.get_text()
nameList=nameStr.split("[")
resultName=nameList[0]
urlList=[]
for i in result:
if "wiki.pl?" and "卷" in str(i):
url=list(i.attrs.values())
webLink="https://ctext.org/"+url[0]
urlList.append(webLink)
elif "wiki.pl?" and "序" in str(i):
url=list(i.attrs.values())
webLink="https://ctext.org/"+url[0]
urlList.append(webLink)
numList=[str(i) for i in range(0,10)]
zongShu=len(urlList)
n=0
writeFile=open(f"{resultName}_FromCtext.txt","a+",encoding="utf-8")
start=startPage-1
for webUrl in urlList[start:]:#列表从0开始
headers['User-Agent']= random.choice(user_agent_list)
print(webUrl)
webfile=requests.get(webUrl,headers=headers)
webfile.encoding="utf-8"
data=webfile.text
obs=bs(data,"html.parser")
obs.prettify()
title=obs.title
for i in title:
print(i,file=writeFile)
print("★",file=writeFile)
result=obs.find_all(class_="ctext")
for i in result:
myStr=i.get_text()
for num in numList:
myStr=myStr.replace(num,"")
print(myStr,file=writeFile)
n+=1
time.sleep(3+random.randint(0,3))
percent=float((n+start)/zongShu)
print(f"第{n+start}页已完成,共计{zongShu}页,完成度","{0:4.2}".format(percent))
endTime=time.time()
long=(endTime-startTime)/60
print("总记时:","{0:4.2}".format(long),"分钟。")
writeFile.close()
三者结合
想把汉典(Zdic)、维基(Wiki)、哲学电子书(Ctext)三个下载脚本结合起来统一调用,可以用如下代码(假定三个脚本分别保存为BsFromZdic.py、BsFromWik.py、BsFromCtext.py,与本脚本位于同一目录):
'''
https://gj.zdic.net/archive.php?aid-6679.html
'''
def test():
    webChoice=input("汉典:Z;维基:W;哲学电子书:C。请输入选择:")
    webChoice=webChoice.upper()
    if webChoice=="Z":
        import BsFromZdic
        BsFromZdic.test()
    elif webChoice=="W":
        import BsFromWik
        BsFromWik.test()
    elif webChoice=="C":
        import BsFromCtext
        BsFromCtext.test()
爬取图片
贴吧中的动画图片
《虹猫蓝兔七侠传》是一部非常不错的动画片,后续还有漫画版的前传和后传。百度贴吧中,有这样一系列图片,现在想把图片爬下来,合成PDF便于阅读。写如下代码:
'''
<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=fbff
fefc1f950a7b75354ecc3ad3625c/4c5fa44bd11373f035f5ca55a60f4bfbf9ed04ca.jpg" pic_ext="jpeg" pic_type="0"
width="560" height="388">
<img class="BDE_Image" src="https://imgsa.baidu.com/forum/w%3D580/sign=efda23249e82d158bb8259b9b00
819d5/acb1c2ef76094b36927cbe27a1cc7cd98c109d2e.jpg"
pic_ext="jpeg" pic_type="0" width="560" height="426">
<img class="image_original_original"
style="z-index: 2; width: 585.176px; height: 450px; top: 0px; left: 75.9122px;"
src="http://imgsrc.baidu.com/forum/pic/item/f82405d7912397dd928f3cce5b82b2b7d1a28726.jpg">
'''
urlList=["http://tieba.baidu.com/p/3175345087",
"http://tieba.baidu.com/p/3175362317",
"http://tieba.baidu.com/p/3175373350",
"http://tieba.baidu.com/p/3175383386",
"http://tieba.baidu.com/p/3175393635",
"http://tieba.baidu.com/p/3175402697",]
import urllib.request
import re
zhang=1
for webUrl in urlList:
    i=1
    htmll=urllib.request.urlopen(webUrl).read()
    data=str(htmll)
    pattern='''img class="image_original_original" src=.(.+?\.jpg)"'''
    result=re.compile(pattern).findall(data)
    for imageUrl in result:
        print(imageUrl)
##        imageName=str(zhang)+"-"+str(i)+".jpg"
##        i=i+1
##        urllib.request.urlretrieve(imageUrl,filename=imageName)
##    zhang=zhang+1
    print()
注意:页面里同一张图往往有缩略(模糊)图和高清图两种,文件名并不一样;高清图的地址不直接出现在当前页面中,需要按规律修正链接后才能爬取。
那么,能否从中找到共性,写成通用的代码呢?于是做了如下尝试:
def test():
'''
http://imgsrc.baidu.com/forum/pic/item/e69597510fb30f24ebcb4ec9ca95d143ac4b0347.jpg
http://imgsrc.baidu.com/forum/pic/item/4c0f7af082025aaf165fdc01f9edab64024f1aa3.jpg
'''
import urllib.request
import requests
from bs4 import BeautifulSoup as bs
import re
print("每个网站的情况并不一致,借鉴此程序后,重新写代码为宜。")
mychoice=input("是否继续 Y or N:")
if mychoice=="Y":
pass
else:
exit()
print("如为避免遗漏而需下载网页,请复制网页代码到web.html并输入D。")
print("如在网上运行,请输入W。")
choice=input("Download or Web:")
webUrlList=[]
while True:
webUrl=input("请输入要下载图片所在的完整网站:")
webUrlList.append(webUrl)
webChoice=input("是否继续输入网站,Y or N:")
if webChoice=="N":
break
##webUrl="https://baike.baidu.com/pic/黑小虎传奇/4659511"#点击进入百度黑小虎传奇图册。
adjust=input("是否需要调整高清图,Y or N:")
classImage=str(input("请输入obs寻找到的class类别:"))
pattern='src="..*?"'
zhang=1
if choice=="D" and adjust=="N":
myfile=open("web.html","r",encoding="utf-8")
data=myfile.read()
myfile.close()
obs=bs(data,"html.parser")
result=obs.find_all(attrs={"class":classImage})
n=1
for i in result:
myLink=re.findall(pattern,str(i))
bLink=str(myLink[0])
print(bLink)
imageName="图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
zhang+=1
elif choice=="D" and adjust=="Y":
addLink="watermark,image_d2F0ZXIvYmFpa2UxODA=,g_7,xp_5,yp_5/format,f_auto"
myfile=open("web.html","r",encoding="utf-8")
data=myfile.read()
myfile.close()
obs=bs(data,"html.parser")
result=obs.find_all("img")
n=1
for i in result:
try:
## print(i)
myLink=re.findall(pattern,str(i))
aLink=myLink[0]
aList=aLink.split("/")
aLink=aList[2][:-1]#需要依据实际情况不断调整。
## print(aList)
bLink=f"https://bkimg.cdn.bcebos.com/pic/{aLink}?x-bce-process=image/{addLink}"
#### bLink=aList[-1]#通过观察,找到更为清晰的图片链接。
print(bLink)
imageName="图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
except:pass
zhang+=1
elif choice=="W" and adjust=="Y":
addLink=input("请根据情况输入图片网址的前半部分:")
#"http://imgsrc.baidu.com/forum/pic/item/"
for webUrl in webUrlList:
html=requests.get(webUrl)
html.encoding="utf-8"
data=html.text
obs=bs(data,"html.parser")
obs.prettify()
result=obs.find_all(attrs={"class":classImage})
n=1
for i in result:
print(i)
myLink=re.findall(pattern,str(i))#bs是用find_all,而re使用findall
print(myLink)
aLink=myLink[0]
aList=aLink.split("/")
bLink=addLink+aList[-1]#通过观察,找到更为清晰的图片链接。
print(bLink)
imageName=str(zhang)+"图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
zhang+=1
elif choice=="W" and adjust=="N":
zhang=1
for webUrl in webUrlList:
html=requests.get(webUrl)
html.encoding="utf-8"
data=html.text
obs=bs(data,"html.parser")
obs.prettify()
result=obs.find_all(attrs={"class":classImage})
n=1
for i in result:
myLink=re.findall(pattern,str(i))
bLink=str(myLink[0])
print(bLink)
imageName="图"+str(n)+".jpg"
urllib.request.urlretrieve(bLink,filename=imageName)
n+=1
zhang+=1
else:print("未能导出图片,请进一步完善程序。")
网站中的地理图片
国家地理网站中,有一些图片也可以进行爬取。
# inputfile='nationalgeograph.html'
# outputfile="nationalgeograph-urls.txt"
def inputFile():
    f=open("nationalgeographic.htm","r",encoding="utf-8")
    ls=f.readlines()
    f.close()
    print(ls)
    #<img alt="火山口" src="http://image.ngchina.com.cn/2019/1104/20191104100458321.jpg">
    urls = []
    for line in ls:
        if 'img' in line:
            url = line.split('src=')[-1].split('"')[1]  # 取src=后第一对双引号内的内容,即图片地址
            if 'http' in url:
                urls.append(url)
    #print(urls)
    return urls
#inputFile()
def showResults():
    urls=inputFile()
    f=open("result.txt","w",encoding="utf-8")
    count = 0
    for url in urls:
        print("第{}个URL:{}".format(count, url),file=f)
        print("第{}个URL:{}".format(count, url))
        count += 1
    f.close()
showResults()
注意,爬取时要先从网页源码中提取出图片本身的http链接(img标签的src属性),再去下载。对其他保存到本地的网页文件,同样的思路也适用:
# inputfile='nationalgeograph.html'
# outputfile="nationalgeograph-urls.txt"
def inputFile():
    f=open("file1.htm","r",encoding="utf-8")
    ls=f.readlines()
    f.close()
    print(ls)
    urls = []
    for line in ls:
        if 'img' in line:
            url = line.split('src=')[-1].split('"')[1]  # 取src=后第一对双引号内的内容
            if 'http' in url:
                urls.append(url)
    #print(urls)
    return urls
#inputFile()
def showResults():
    urls=inputFile()
    f=open("result.txt","w",encoding="utf-8")
    count = 0
    for url in urls:
        print("第{}个URL:{}".format(count, url),file=f)
        print("第{}个URL:{}".format(count, url))
        count += 1
    f.close()
showResults()
网站中的人物图片
在学习爬虫的时候,借鉴了网友的代码,如下所示。运行过程中发现有bug需要修正;在一步步运行并修正代码的过程中,也对爬虫有了更深入的理解。
import requests
url="http://www.runoob.com"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
#设置headers,网站会根据这个判断你的浏览器及操作系统,很多网站没有此信息将拒绝你访问
#用get方法打开url并发送headers
html = requests.get(url,headers = header)
#print(html.text)
#提取所需要的信息
##将获取的源码转换为BeautifulSoup对象
##使用find搜索需要的数据,保存到容器中
from bs4 import BeautifulSoup
url='http://www.mzitu.com'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}
html=requests.get(url,headers=header)
#print(html.text)
#使用自带的html.parser解析,速度慢但通用
soup = BeautifulSoup(html.text,'html.parser')
#寻找div中的所有a
all_a=soup.find("div",class_="postlist").find_all("a",target="_blank")
##for a in all_a:
## title=a.get_text()#提取文本
## print(title)
##
##
##all_div=soup.find("div",class_="postlist")
##for i in all_div:
## tmp=i.get_text()
## print(tmp)
#find 返回类型和find_all返回类型不同,find_all才能用get_text()
##page = soup.find_all('a', class_='page-numbers')
##max_page = page[-2].text
###print(max_page)
picture=soup.find("div",class_='postlist').find_all("a",target="_blank")
for everylink in picture:
    #print(everylink)
    tmp=everylink.attrs
    #print(tmp)
    mytxt=everylink.get_text()
    if "href" in everylink.attrs:
        print(f"href={everylink.attrs['href']}",sep="\t")
#print(picture)
# same_url = 'http://www.mzitu.com/page/' # 主页默认最新图片
# 获取每一类MM的网址
##same_url = 'https://www.mzitu.com/mm/page/'
##
##
##for n in range(1, int(max_page) + 1):
## ul = same_url + str(n)
## #print(ul)
## # 分别对当前类每一页第一层url发起请求
## start_html = requests.get(ul, headers=header)
## # 提取所有MM的标题
## soup = BeautifulSoup(start_html.text, "html.parser")
## all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
## #print(all_a)
##
## # 遍历所有MM的标题
## for a in all_a:
## # 提取标题文本,作为文件夹名称
## title = a.get_text()
## print(title)
## if(title != ''):
## print("准备扒取:" + title)
## if(oa.path.exists(path+title.strip()))
##
##
##
于是,经过试错,不断修正,完善为如下代码:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
all_url = 'https://www.mzitu.com'
# http请求头
Hostreferer = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Referer': 'http://www.mzitu.com'
}
# 此请求头Referer破解盗图链接
Picreferer = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Referer': 'http://i.meizitu.net'
}
# 对mzitu主页all_url发起请求,将返回的HTML数据保存,便于解析
start_html = requests.get(all_url, headers=Hostreferer)
# Linux保存地址
# path = '/home/Nick/Desktop/mzitu/'
# Windows保存地址
path = 'E:/mzitu/'
# 获取最大页数
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text
# same_url = 'http://www.mzitu.com/page/' # 主页默认最新图片
# 获取每一类MM的网址
same_url = 'https://www.mzitu.com/mm/page/' # 也可以指定《qingchun MM系列》
for n in range(1, int(max_page) + 1):
# 拼接当前类MM的所有url
ul = same_url + str(n)
# 分别对当前类每一页第一层url发起请求
start_html = requests.get(ul, headers=Hostreferer)
# 提取所有MM的标题
soup = BeautifulSoup(start_html.text, "html.parser")
all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
# 遍历所有MM的标题
for a in all_a:
# 提取标题文本,作为文件夹名称
title = a.get_text()
if(title != ''):
print("准备扒取:" + title)
# windows不能创建带?的目录,添加判断逻辑
if(os.path.exists(path + title.strip().replace('?', ''))):
# print('目录已存在')
flag = 1
else:
os.makedirs(path + title.strip().replace('?', ''))
flag = 0
# 切换到上一步创建的目录
os.chdir(path + title.strip().replace('?', ''))
# 提取第一层每一个MM的url,并发起请求
href = a['href']
html = requests.get(href, headers=Hostreferer)
mess = BeautifulSoup(html.text, "html.parser")
# 获取第二层最大页数
pic_max = mess.find_all('span')
pic_max = pic_max[9].text
if(flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
print('已经保存完毕,跳过')
continue
# 遍历第二层每张图片的url
for num in range(1, int(pic_max) + 1):
# 拼接每张图片的url
pic = href + '/' + str(num)
# 发起请求
html = requests.get(pic, headers=Hostreferer)
mess = BeautifulSoup(html.text, "html.parser")
pic_url = mess.find('img', alt=title)
print(pic_url['src'])
html = requests.get(pic_url['src'], headers=Picreferer)
# 提取图片名字
file_name = pic_url['src'].split(r'/')[-1]
# 保存图片
f = open(file_name, 'wb')
f.write(html.content)
f.close()
print('完成')
print('第', n, '页完成')
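The script above strips only '?' from titles because Windows cannot create directories containing it; characters such as \ / : * " < > | are equally invalid. A small helper along these lines (a sketch, not part of the original script) keeps that directory logic in one place:
import os
import re

def safe_dirname(title):
    """Remove characters Windows does not allow in file or directory names."""
    return re.sub(r'[\\/:*?"<>|]', '', title).strip()

def ensure_dir(base, title):
    """Create base/<sanitised title> if needed, chdir into it, return whether it already existed."""
    target = os.path.join(base, safe_dirname(title))
    existed = os.path.exists(target)
    if not existed:
        os.makedirs(target)
    os.chdir(target)
    return existed

# Usage inside the loop above (sketch):
# flag = 1 if ensure_dir(path, title) else 0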
个人图书馆中的学习图片
在360doc中,有些图片很利于学习,如何爬取呢?
写如下代码:
import urllib.request
import requests
from bs4 import BeautifulSoup as bs
import re
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
webUrl="http://www.360doc.com/showweb/0/0/1104723360.aspx"
webFile=requests.get(webUrl)
webFile.encoding="utf-8"
data=webFile.text
print(data)
运行后,返回效果如下
<html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
<center><h1>403 Forbidden</h1></center>
<hr><center>nginx</center>
</body>
</html>
A likely cause is that the page requires a logged-in account (or at least browser-like request headers) before it can be viewed.
403 Forbidden is an HTTP status code: the server understood the request but refuses to fulfil it, i.e. access to the resource is denied, and simply resending the same request will not help.
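One thing the snippet above never does is pass the headers dict it defines to requests.get, so the request goes out without a browser User-Agent. Sending it sometimes clears a 403; this is only a sketch, and 360doc may still refuse the request without a logged-in session's cookies.
import requests

webUrl = "http://www.360doc.com/showweb/0/0/1104723360.aspx"
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}

webFile = requests.get(webUrl, headers=headers)   # this time actually send the headers
print(webFile.status_code)                        # 200 if accepted, 403 if still refused
if webFile.status_code == 200:
    webFile.encoding = "utf-8"
    print(webFile.text[:200])                     # peek at the start of the page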
If the request is still refused, simply pressing Ctrl + S and saving the whole page is often the more efficient route: all of the page's images are saved along with it.
【心得】
爬取综合信息
网站中的邮箱号码
对于网易中的邮箱号码,也可以进行爬取。
def Gupiao():
htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
htmlfile.encoding='utf-8'
mysoup = BeautifulSoup(htmlfile.text,'html.parser')
mycontent=mysoup.prettify()
#print(type(mycontent))
#输出字符串的前面信息,便于观察整个网站构成
print(mycontent[:200])
print()
#寻找需要的信息,区分不同的语法
def Find():
myinfor=mysoup.find_all("a")
for i in myinfor:
tmp=i.get_text()
print(tmp)
print(i)
print()
print(i.prettify())
#print(myinfor)
#将需要的网站输出
def Wangzhan():
urlsList=[]
myinfor=mysoup.find_all("a")
for line in myinfor:
#print(line)
tmp=str(line)#line的类型是<class 'bs4.element.Tag'>
if "http" in tmp:
url=tmp.split('"')#将长的字符串切分,留下网站
urlsList.append(url[1])
print(line.get_text())#获得网站的标题
print(url[1])#输出网站字符串
Wangzhan()
Gupiao()
def Ceyan():
htmlfile=requests.get("http://quotes.money.163.com/trade/lsjysj_zhishu_000001.html?year=")
htmlfile.encoding='utf-8'
mysoup = BeautifulSoup(htmlfile.text,'html.parser')
mycontent=mysoup.prettify()
#print(type(mycontent))
print(mycontent[:500])
print()
myinfor=mysoup.find_all("a")#<a href="http://www.163.com/">网易首页</a>,寻找的是属性,如a,如td,如tr,head,body,
for i in myinfor:
tmp=i.get_text()
print(tmp)
print(i)
print()
print(i.prettify())#用不同的语法格式,看看输出的效果如何。然后就知道各个语句的用法何在。prettify的作用是把密密麻麻的一行输出为整齐的几行,便于阅读。
#print(myinfor)
Ceyan()
In practice problems still come up. Here the same link extraction is run against a locally saved copy of the page (stock1.html), so the filtering logic can be adjusted and re-run step by step:
from bs4 import BeautifulSoup
import time
htmlFile=open("stock1.html","r",encoding="utf-8")
htmlContent=htmlFile.read()
#time.sleep(10) #暂停10秒
myBS=BeautifulSoup(htmlContent,"html.parser")
#print(myBS)
myLinks=myBS.find_all("a")
#print(myLinks)
for everyLink in myLinks:
myText=everyLink.get_text()
#print(myText)
if "163.com" not in myText:
print("test")
print(myText)
if "href" in everyLink.attrs:#属性attrs
print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
#print(myText,":href=",everyLink.attrs['href'],sep="\t")
'''
myBS=BeautifulSoup(htmlContent,'html.parser')
myLinks=myBS.find_all('a')
for everyLink in myLinks:
    myText=everyLink.get_text()
    if '163.com' not in myText:
        if 'href' in everyLink.attrs:
            print(everyLink)
Question: why do 163.com links still show up?
Because the filter checks the link text returned by get_text() (for example "网易首页"), not the href attribute, so a link whose visible text never mentions 163.com is kept even though its href points to 163.com.
运行结果:
网易首页:href=http://www.163.com/
新闻:href=http://news.163.com/
体育:href=http://sports.163.com/
NBA:href=http://sports.163.com/nba/
娱乐:href=http://ent.163.com/
财经:href=http://money.163.com/
股票:href=http://money.163.com/stock/
汽车:href=http://auto.163.com/
科技:href=http://tech.163.com/
'''
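The answer above also suggests the fix: test the href attribute rather than the link text. A minimal sketch against the same saved stock1.html:
from bs4 import BeautifulSoup

htmlFile = open("stock1.html", "r", encoding="utf-8")
htmlContent = htmlFile.read()
htmlFile.close()

myBS = BeautifulSoup(htmlContent, "html.parser")
for everyLink in myBS.find_all("a"):
    if "href" in everyLink.attrs and "163.com" not in everyLink.attrs["href"]:
        # only links whose URL is outside 163.com get printed now
        print(everyLink.get_text(), ":href=", everyLink.attrs["href"], sep="\t")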
爬取之后,有序输出。
网站中的大学信息
读取网络文本
import requests
webFile=requests.get("http://www.pku.edu.cn")
webFile.encoding="utf-8"
webFile=webFile.text
print(webFile)
解析网页
import requests
response=requests.get('https://www.pku.edu.cn')
mycode=response.status_code
mycontent=response.content
分析所爬内容
with open(r"E:\pkuCode.txt","r",encoding="utf-8") as myFile:
data=myFile.readlines()
myList=list(data)
for i in myList:
print(i)
input()
解析对象
import requests
from bs4 import BeautifulSoup as bs
webFile=requests.get("https://www.pku.edu.cn")#爬虫获得html文件
webFile.encoding="utf-8"#爬虫解析网页文件
data=webFile.text#用text文档形式展现,解析为字符串
soup=bs(data,"html.parser")# 把网页解析为BeautifulSoup对象
soup.prettify()
items2=soup.find_all(class_="item")
myFile=open(r"E:\mySchoolLink.txt","w",encoding="utf-8")
for everyTag in items2:
#print(everyTag)
print(file=myFile)
print("文字部分",file=myFile)
myText=everyTag.get_text()
print(myText,file=myFile)
print(file=myFile)
print("链接部分",file=myFile)
myLinks=everyTag.find_all("a")#everyLink是BS 中的tag
for everyLink in myLinks:
if "href" in everyLink.attrs:#attrs只有在BS 中tag中才可以用。
print(everyLink.attrs,file=myFile)
myFile.close()
requests can also be used to save the page source to a local file for later analysis:
import requests
file1=requests.get("https://www.pku.edu.cn")
file1.encoding="utf-8"
data=file1.text
myFile=open(r"E:\pkuCode.txt","w",encoding="utf-8")
print(data,file=myFile)
myFile.close()
'''
soup的数据类型是<class 'bs4.BeautifulSoup'>,说明soup是一个BeautifulSoup对象
打印的soup,是所请求网页的完整HTML源代码
虽然response.text和soup打印出的内容表面上看长得一模一样,却有着不同的内心,它们属于不同的类:<class 'str'> 与<class 'bs4.BeautifulSoup'>。前者是字符串,后者是已经被解析过的BeautifulSoup对象。之所以打印出来的是一样的文本,是因为BeautifulSoup对象在直接打印它的时候会调用该对象内的str方法,所以直接打印 bs 对象显示字符串是str的返回结果
'''
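The point in the note above can be checked directly: the response text and the parsed soup print the same characters but belong to different classes. A short sketch:
import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.pku.edu.cn")
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")

print(type(response.text))   # <class 'str'>
print(type(soup))            # <class 'bs4.BeautifulSoup'>
# printing soup calls its __str__ method, which is why the two look identical
print(response.text[:80])
print(str(soup)[:80])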
网站中的音乐
import requests
from bs4 import BeautifulSoup as bs
headers = {
'origin':'https://y.qq.com',
# 请求来源,本案例中其实是不需要加这个参数的,只是为了演示
'referer':'https://y.qq.com/n/yqq/song/004Z8Ihr0JIu5s.html',
# 请求来源,携带的信息比“origin”更丰富,本案例中其实是不需要加这个参数的,只是为了演示
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
# 标记了请求从什么设备,什么浏览器上发出
}
# 伪装请求头
url1='''
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
'''
url2="p=1"
url3="""
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""
#注意,这个网址是在网页检查过程中找到并复制的,针对文本所在内容的网址,而不是qq音乐的官网。详情参看风变编程笔记。
for i in range(1,10):
url=url1+"p="+str(i)+url3
webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text
jsonMusic=webFile.json()
listMusic = jsonMusic['data']['song']['list']
for i in listMusic:
print("专辑名:",i["albumname"])
print("歌曲名:",i["songname"])
print('播放时长:'+str(i['interval'])+'秒')
print('播放链接:https://y.qq.com/n/yqq/song/'+i['media_mid']+'.html\n\n')
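Splicing "p=" into a triple-quoted URL works, but the embedded newlines and the unused url2 make it fragile. requests can build the query string itself from a params dict, the same way the Zhihu example later in these notes does. A sketch of the equivalent request; the parameter names are simply the ones already visible in url1/url3 above, and whether the endpoint still answers (or now needs extra sign parameters) is up to QQ Music.
import requests

search_url = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp"
base_params = {
    "ct": "24", "qqmusic_ver": "1298", "new_json": "1",
    "remoteplace": "txt.yqq.song", "searchid": "57068364391640558",
    "t": "0", "aggr": "1", "cr": "1", "catZhida": "1", "lossless": "0",
    "flag_qc": "0", "n": "10",
    "w": "周杰伦",                 # the keyword that appears URL-encoded in url3 above
    "g_tk_new_20200303": "5381", "g_tk": "5381", "loginUin": "0", "hostUin": "0",
    "format": "json", "inCharset": "utf8", "outCharset": "utf-8",
    "notice": "0", "platform": "yqq.json", "needNewCode": "0",
}
for page in range(1, 10):
    params = dict(base_params, p=str(page))      # p is the page number
    webFile = requests.get(search_url, params=params)
    jsonMusic = webFile.json()
    for song in jsonMusic["data"]["song"]["list"]:
        print("专辑名:", song["albumname"])
        print("歌曲名:", song["songname"])
        print("播放时长:" + str(song["interval"]) + "秒")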
import requests
from bs4 import BeautifulSoup as bs
myHref="https://y.qq.com/n/yqq/singer/0025NhlN2yWrP4.html"
webFile=requests.get(myHref)
data=webFile.text
soup=bs(data,"html.parser")
print("""class_=js_song""")
items1=soup.find_all(class_="js_song")
count=0
for everyLink in items1:
myText=everyLink.get_text()
print("everyLink : ","\n",everyLink)
print("myText:","\n",myText)
print("everyLink.attrs:","\n",everyLink.attrs)
print(everyLink.attrs["href"])
count+=1
if count==1:
break
print()
print("""class_=songlist__songname_txt""")
items2=soup.find_all(class_="songlist__songname_txt")
count=0
for everyLink in items2:
myText=everyLink.get_text()
print("everyLink : ","\n",everyLink)
print("myText:","\n",myText)
print("everyLink.attrs:","\n",everyLink.attrs)
print(everyLink.attrs["class"])
count+=1
if count==1:
break
'''
if "href" in everyLink.attrs:#属性attrs
print(f"{myText}:href={everyLink.attrs['href']}",sep="\t")
print(myText,":href=",everyLink.attrs['href'],sep="\t")
Note: when extracting with bs4, the class used for filtering is the key. Every matched element is a Tag, and its attrs is a dict of that tag's attributes.
If the matched tags are links (<a> elements), everyLink.attrs["href"] gives the link.
If they are plain text containers matched only by class, there is no "href" key, and only entries such as everyLink.attrs["class"] are available.
'''
import requests
from bs4 import BeautifulSoup as bs
webURL="""
https://c.y.qq.com/soso/fcgi-bin/client_search_cp?
ct=24&qqmusic_ver=1298&
new_json=1&remoteplace=txt.yqq.song&
searchid=57068364391640558&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&
p=1
&n=10&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
"""
webFile=requests.get(webURL)
webFile.encoding="utf-8"
data=webFile.text
jsonFile=webFile.json()
##print(type(jsonFile))#<class 'dict'>使用json()方法,将对象转为列表/字典
##for (k,v) in jsonFile.items():
## print(k)
musicData=jsonFile["data"]#JSON keys are strings, so the quotes are required; jsonFile[data] would look up whatever the variable data holds (here the page text) and raise a KeyError
##print(type(musicData))
##for (k,v) in musicData.items():
## print(k)
listMusic=musicData["song"]["list"]
print(type(listMusic))
for music in listMusic:
print("播放专辑:",music["album"]["name"])
print('播放时长:'+str(music['interval'])+'秒') # 查找播放时长
print('播放链接:https://y.qq.com/n/yqq/song/' +music['mid']+'.html\n\n')
input()
##
##soup=bs(data,"html.parser")
##print(type(soup))#<class 'bs4.BeautifulSoup'>
import requests
from bs4 import BeautifulSoup as bs
import openpyxl
workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"
url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"
webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
sheet1.append(["footer_platform_list__item"])
Tag1=soup.find_all(class_="footer_platform_list__item")
for everyTag in Tag1:
myText=everyTag.get_text()
myLinks=everyTag.find_all("a")
for i in myLinks:
if "href" in i.attrs:
myList1=[myText,i['href']]
print(myList1)
sheet1.append(myList1)
sheet1.append(["footer_link"])
Tag2=soup.find_all(class_="footer_link")
for everyTag in Tag2:
myText=everyTag.get_text()
myLinks=everyTag.find_all("a")
for i in myLinks:
if "href" in i.attrs:
myList2=[myText,i["href"]]
print(myList2)
sheet1.append(myList2)
workBook.save("积累文档-QQ音乐网络链接.xlsx")
import requests
from bs4 import BeautifulSoup as bs
import openpyxl
workBook=openpyxl.Workbook()
sheet1=workBook.active
sheet1.title="qq音乐链接表"
url="https://y.qq.com/n/yqq/singer/000FzZ3q3kxTMG.html"
webFile=requests.get(url)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
myClass=['footer_platform_list__item','footer_link','footer_download','footer_copyright','footer_platform']#the class names to extract, each listed once
for everyClass in myClass:
print(everyClass)
sheet1.append([everyClass])
Tag1=soup.find_all(class_=everyClass)
for everyTag in Tag1:
myText=everyTag.get_text()
myLinks=everyTag.find_all("a")
for i in myLinks:
if "href" in i.attrs:
myList1=[myText,i["href"]]
print(myList1)
sheet1.append(myList1)
workBook.save("积累文档-QQ音乐链接简练版.xlsx")
网站中的题目
#爬取网站上的题目
from bs4 import BeautifulSoup
import time
import requests
def Pachong():
for pageNum in range(1,17):
htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey='+str(pageNum))
htmlFile.encoding='utf-8'
soup = BeautifulSoup(htmlFile.text,'html.parser')
print(soup)
input()
#Pachong()
htmlFile=requests.get('http://vers.cqvip.com/view/course/subject/list.aspx?stid=ad29646905394d96a50c1818329fb4f6&cid=120&searchkey=2')
htmlFile.encoding='utf-8'
print(htmlFile)
def PaTi():
htmlfile=requests.get("http://vers.cqvip.com/view/course/chapter/detail.aspx?cid=125&chapter=%E6%98%8E%E4%BB%A3%E6%96%87%E5%AD%A6")
htmlfile.encoding='utf-8'
mysoup=BeautifulSoup(htmlfile.text,'html.parser')
mycontent1=mysoup.prettify()
print(mycontent1[:100])
mycontent2=mysoup.smooth()#smooth() merges adjacent NavigableStrings in place and returns None
print(mycontent2)#so this prints None
print("OK")
mycontent3=mysoup.select_one("div")
print(mycontent3)
print("Next")
print()
myinfor=mysoup.find("div").find_all("strong")
print(myinfor)
tmp=mysoup.find_next_sibling("div")#the soup object itself has no siblings, so this returns None
print(tmp)
#class="q-box"
PaTi()
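The comment above points at class="q-box". Assuming that class really wraps the question blocks on the page (not verified here), extracting them is the usual find_all-by-class pattern:
import requests
from bs4 import BeautifulSoup

def PaTi_qbox():
    url = ("http://vers.cqvip.com/view/course/chapter/detail.aspx"
           "?cid=125&chapter=%E6%98%8E%E4%BB%A3%E6%96%87%E5%AD%A6")
    htmlfile = requests.get(url)
    htmlfile.encoding = "utf-8"
    mysoup = BeautifulSoup(htmlfile.text, "html.parser")
    # class_ filters on the CSS class; "q-box" is taken from the comment above
    for box in mysoup.find_all(class_="q-box"):
        print(box.get_text(strip=True))
        print()

# PaTi_qbox()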
网站中的博客内容
知乎中的文章
import requests
import csv
#引用csv。
csv_file=open('articles.csv','w',newline='',encoding='utf-8')
#调用open()函数打开csv文件,传入参数:文件名“articles.csv”、写入模式“w”、newline=''。
writer = csv.writer(csv_file)
# 用csv.writer()函数创建一个writer对象。
list2=['标题','链接','摘要']
#创建一个列表
writer.writerow(list2)
#调用writer对象的writerow()方法,可以在csv文件里写入一行文字 “标题”和“链接”和"摘要"。
headers={'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
url='https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?'
offset=0
#设置offset的起始值为0
while True:
params={
'include':'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'offset':str(offset),
'limit':'20',
'sort_by':'voteups',
}
#封装参数
res=requests.get(url,headers=headers,params=params)
#发送请求,并把响应内容赋值到变量res里面
articles=res.json()
print(articles)
data=articles['data']
#定位数据
for i in data:
list1=[i['title'],i['url'],i['excerpt']]
#把目标数据封装成一个列表
writer.writerow(list1)
#调用writerow()方法,把列表list1的内容写入
offset=offset+20
#在while循环内部,offset的值每次增加20
if offset > 40:
break
csv_file.close()
#写入完成后,关闭文件就大功告成
print('okay')
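The offset loop above stops after a fixed number of pages (offset > 40). A slightly more general sketch wraps the pagination in a function and stops either at a page limit or when the API returns no more data; the field names ('data', 'title', 'url', 'excerpt') are the ones already used above.
import csv
import requests

def fetch_articles(url, headers, params_base, max_pages=3):
    """Yield (title, url, excerpt) tuples page by page until empty or max_pages."""
    offset = 0
    for _ in range(max_pages):
        params = dict(params_base, offset=str(offset))   # overrides any existing offset
        res = requests.get(url, headers=headers, params=params)
        data = res.json().get('data', [])
        if not data:                 # no more articles: stop early
            break
        for item in data:
            yield item['title'], item['url'], item['excerpt']
        offset += 20

# Usage sketch, reusing the url/headers/params defined above:
# with open('articles.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(['标题', '链接', '摘要'])
#     for row in fetch_articles(url, headers, params, max_pages=3):
#         writer.writerow(row)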
import requests
from bs4 import BeautifulSoup as bs
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
params = {
'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
'limit': '20',
'sort_by': 'created'
}
url2="https://www.zhihu.com/org/jing-ji-ri-bao-xin-wen-ke-hu-duan/posts"
webFile=requests.get(url2,params=params,headers=headers)
webFile.encoding="utf-8"
data=webFile.text
soup=bs(data,"html.parser")
preData=soup.prettify()
items2=soup.find_all(class_="item")
for iTag in items2:
for i in iTag.find_all():
print(i)
爬取博客
from urllib3 import *
from re import *
http=PoolManager()
#禁止显示警告信息
disable_warnings()
#下载url对应web页面
url="https://www.cnblogs.com/"
result=http.request("GET",url)
htmlStr=result.data.decode("utf-8")
print(htmlStr)
#分析html代码
#通过正则表达式,获取所有关于目标的信息
#<a class="post-item-title" href="https://www.cnblogs.com/hzoi-fengwu/p/14922218.html" target="_blank">STL----vector注意事项</a>
aList=findall('<a[^>]*post-item-title[^>]*>[^<]*</a>',htmlStr)
result=[]
#pull the url that follows href= out of every matched <a ...> tag
for a in aList:
    #the capture group keeps the text between the double quotes after href=
    g=search(r'href[\s]*=[\s]*"([^"]+)"',a)
    if g!=None:
        url=g.group(1)
        result.append(url)
        #got the url
        print(url)
爬取博客标题-爬虫-正则表达式部分
网站中的词典
#网络爬虫进阶urllib.request
def ilovefish():
import urllib.request
myResponse=urllib.request.urlopen("https://ilovefishc.com/")#打开网页,获取信息
myHtml=myResponse.read()#读出数据
#print(myHtml)
myHtml=myHtml.decode("utf-8")#将二进制解码,按照网页信息<head> <meta charset="UTF-8">选择解码格式utf-8
#print(myHtml)
def placekitten():
#placekitten.com
import urllib.request
myResponse=urllib.request.urlopen("http://placekitten.com/500/600")#打开网页,获取信息
my_cat_img=myResponse.read()#读出数据
with open('cat_500_600.jpg','wb') as f:
f.write(my_cat_img)
def myrequest():
#urllib.request():This function always returns an object which can work as a context manager and has methods such as
#geturl() — return the URL of the resource retrieved, commonly used to determine if a redirect was followed
#info() — return the meta-information of the page, such as headers, in the form of an email.message_from_string() instance (see Quick Reference to HTTP Headers)
#getcode() – return the HTTP status code of the response.
import urllib.request
myresponse=urllib.request.urlopen("http://placekitten.com/300/500")
myurl=myresponse.geturl()
print(myurl)
print(myresponse.info())
print(myresponse.getcode())
def Cidan():
    #小甲鱼's example of turning the Youdao dictionary feature into a small program
    import urllib.request
    import urllib.parse
    url='http://fanyi.youdao.com/'
    data={}#the form fields observed in the browser's network panel go here
    postData=urllib.parse.urlencode(data).encode('utf-8')#urlopen expects the POST body as bytes
    my_response=urllib.request.urlopen(url,postData)
    return my_response
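Cidan() stops short of actually parsing a reply. To complete the idea, the form data has to be URL-encoded into bytes and the response decoded; the field names below (i, doctype) are only illustrative assumptions about what such a form might contain, copied in spirit from a browser's network panel rather than verified against the current Youdao interface.
import json
import urllib.parse
import urllib.request

def Cidan2(word):
    # Hypothetical endpoint/fields: copy the real ones from the browser's network panel.
    url = 'http://fanyi.youdao.com/'
    form = {
        'i': word,          # assumed name of the field holding the text to translate
        'doctype': 'json',  # assumed flag asking for a JSON response
    }
    postData = urllib.parse.urlencode(form).encode('utf-8')  # POST body must be bytes
    with urllib.request.urlopen(url, postData) as response:
        raw = response.read().decode('utf-8')
    try:
        return json.loads(raw)   # parse if the server really returned JSON
    except json.JSONDecodeError:
        return raw               # otherwise just hand back the text

# print(Cidan2('python'))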
【心得】