re 是 Python 标准库中用于正则表达式操作的模块,支持字符串的模式匹配、搜索、替换、分割等功能,适用于数据清洗、文本解析、输入验证等场景。

核心方法包括 re.match、re.search、re.findall、re.sub、re.split 和 re.compile,用法见下面的示例:

re库示例代码

import re
# Quick tour of the core `re` functions. Every pattern is a raw string with
# SINGLE backslashes: inside r'...' a doubled backslash is a regex-escaped
# literal backslash, so r'\\d+' would look for "\ddd" instead of digits.

# 1. Match only at the start of the string.
m = re.match(r'Hello', 'Hello World')
if m:
    print("match:", m.group())  # Hello
# 2. Search for the first match anywhere in the string.
s = re.search(r'World', 'Hello World')
if s:  # re.search returns None when nothing matches, same as re.match
    print("search:", s.group())  # World
# 3. Find every non-overlapping match.
print(re.findall(r'\d+', 'abc123xyz456'))  # ['123', '456']
# 4. Substitute; \1 in the replacement refers to the first capture group.
text = "Price: $100"
print(re.sub(r'\$(\d+)', r'¥\1', text))  # Price: ¥100
# 5. Split on a comma or semicolon followed by optional whitespace.
print(re.split(r'[,;]\s*', 'apple, orange; banana'))
# ['apple', 'orange', 'banana']
# 6. Compile once, then reuse the pattern object.
pattern = re.compile(r'\b\w+ly\b')
print(pattern.findall("quickly and carefully"))  # ['quickly', 'carefully']

使用示例

import xlwt
import re
import requests
import pandas as pd
# Scrape the movie list from the seed page with regular expressions, write the
# captured fields into an .xls workbook, and optionally re-sort the saved
# sheet by rating.

# Create the workbook and the single sheet that receives the data.
workBook = xlwt.Workbook("UTF-8")
oneWorkSheet = workBook.add_sheet("sheet1")
# Seed URL (plain URL; angle brackets around it would make it unfetchable).
base_url = 'https://www.cikeee.com/wangri'
# Send the GET request; the timeout keeps a stalled connection from hanging
# the script forever.
res = requests.get(base_url, timeout=10)
# Decode the response body as UTF-8.
res.encoding = 'utf-8'
# Extract the fields. reg1 captures the text of each 'mov-title' span;
# reg2..reg5 capture the 1st..4th space-separated token of each 'mov-info'
# span.
reg1 = re.findall(r"<span homeworks='mov-title'>(.*?)</span>", res.text)
reg2 = re.findall(r"<span homeworks='mov-info'>(.*?) .*?</span>", res.text)
reg3 = re.findall(r"<span homeworks='mov-info'>.*? (.*?) .*?</span>", res.text)
reg4 = re.findall(r"<span homeworks='mov-info'>.*? .*? (.*?) .*?</span>", res.text)
reg5 = re.findall(r"<span homeworks='mov-info'>.*? .*? .*? (.*?)</span>", res.text)
# One file name shared by the save, the re-read and the re-save, so the
# script works from any working directory on any machine.
output_path = "电影日历-往日推荐.xls"
# (header, values) for each column, in sheet order.
columns = (
    ("电影名", reg1),
    ("电影评分", reg2),
    ("电影类型", reg3),
    ("电影年代", reg4),
    ("起源地", reg5),
)
# Row 0 holds the header; values fill rows 1.. of the same column. Each
# column is written independently, so lists of different lengths are fine.
for col_index, (header, values) in enumerate(columns):
    oneWorkSheet.write(0, col_index, header)
    for row_index, value in enumerate(values, start=1):
        oneWorkSheet.write(row_index, col_index, value)
workBook.save(output_path)
# Optional: sort the saved sheet by rating, highest first.
Choose = input("是否按评分进行排序(请输入yes或no):")
if Choose == "yes":
    # NOTE(review): reading/writing .xls through pandas depends on the
    # installed engines (xlrd for reading, xlwt for writing) - confirm they
    # are supported by the pandas version in use.
    df = pd.read_excel(output_path)
    # NOTE(review): the ratings were written as regex-captured strings; if
    # they come back as text this sort is lexicographic, not numeric - confirm.
    df.sort_values(by='电影评分', inplace=True, ascending=False)
    df.to_excel(output_path)