python爬虫——百度文本审核技术

2026-02-20 05:33:00

1、目标:在终端输入文字,获取网页响应结果。

网址:https://ai.baidu.com/tech/textcensoring

按F12打开开发者工具,选择Network(网络)面板,然后在网页中输入任意文字,点击分析,可以发现请求列表中多出了一个叫aidemo的请求

python爬虫——百度文本审核技术

2、显而易见,这个请求就是api接口~直接调用接口即可~

下面上代码:

import requests

# Endpoint that the text-censoring demo page posts to.
url = 'https://ai.baidu.com/aidemo'

text = input('输入需要审核的文字(最多200字):   ')

# Form fields copied from the browser request seen in DevTools.
# NOTE(review): 'requestTime' and 'token' are values captured from a single
# browser session - presumably they can go stale; confirm with a fresh capture.
data = {
    'content': text,
    'type': 'textcensor',
    'apiType': 'censor',
    'requestTime': '1628780167667',
    'token': '2fe1cba870',
}

# Browser-like headers copied from DevTools, minus two entries:
# - 'content-length': requests derives it from the actual encoded body, so the
#   captured fixed value '103' does not belong here.
# - 'accept-encoding': the captured value advertised 'br'; leaving the header
#   out lets requests advertise only the encodings it is able to decode.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'content-type': 'application/x-www-form-urlencoded',
    'cookie': 'BAIDUID_BFESS=EAB301E34BAB5B3829977A4FDE00ED44:FG=1; BAIDUID=EAB301E34BAB5B385D72A84F2B5F0276:FG=1; BIDUPSID=EAB301E34BAB5B385D72A84F2B5F0276; PSTM=1628757735; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; __yjs_duid=1_cc834c6c33b9da38de2c8e4c95cf28491628775855270; Hm_lvt_8b973192450250dd85b9011320b455ba=1628775852; Hm_lpvt_8b973192450250dd85b9011320b455ba=1628780471; ab_sr=1.0.1_YjI5OTFlOWQ5OWI1ZjIwM2NmN2U5M2QyMTFiYjY2NWUxY2QxMWJhOTliOGYyN2Q1YWYwM2JhY2ZmZDU0NmQ4NTU1ZTJiNDRiN2RjMWRjOTNjYjdiNTE0YmFiOGQ3ZDE4NjljNmQ0MDk3NTg2N2QyZDJiZDA0OGI4MDdmYzRlYzkzMDI3M2Q1NWJhMzgwMzdhZDE1MGUyNGQ2MWM5MmMwMw==; __yjs_st=2_MjYxNDkyMTMwMWNiNzM2ZGU4NDM0NmQ4YTcyMTM2MTgxMTU0OWY5NzU4NzY2NjI0OTRiNTkyN2VjZjg0MDIwMjBiMWY1Zjg1YzFjNjNiY2QxNzliY2JlZWU0N2YwNGJlYjA5YzhiYzk0NGRhNTMxNGJjMjYxMjE1M2Y5YjJhNWQ1ZDE4NzFlMTNlM2VmODc2MTNlY2NlN2NlMDRkMDE1NDA2NjI1MzY5YmU0YTY4Mzc1YWYzYjRiNzUxZDRiYmJkMzRlODIyMTFiYWE3YjhjMjlhM2RmZDZkOTQ5OGRlMmQ5MjRkNWM2YWJlNDNkM2JjNmYzNzlmMTk4YmYzMmQxYl83X2U4YzNlODYy',
    'origin': 'https://ai.baidu.com',
    'referer': 'https://ai.baidu.com/tech/textcensoring',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}

# A timeout keeps the script from waiting forever on an unresponsive server.
res = requests.post(url, data=data, headers=headers, timeout=10)

这样请求部分代码就写好了

3、有时候会出现服务器连接失败的情况,所以要加上报错处理:

def posts():
    """Send the censoring request and publish the outcome as module globals.

    Sets ``res`` to the response object returned by ``requests.post`` and
    ``js`` to its decoded JSON body. Any exception raised by the request or
    by JSON decoding propagates to the caller.
    """
    global js, res
    # Without a timeout, requests waits indefinitely if the server never answers.
    res = requests.post(url, data=data, headers=headers, timeout=10)
    js = res.json()

# Try the request a bounded number of times. Only network errors and
# undecodable JSON bodies are retried; anything else (including Ctrl+C)
# propagates immediately, and so does the error from the last attempt.
max_attempts = 3
for attempt in range(max_attempts):
    try:
        posts()
        break
    except (requests.exceptions.RequestException, ValueError):
        if attempt == max_attempts - 1:
            raise

这样请求失败时就会自动重试,而不是直接报错退出

python爬虫——百度文本审核技术

4、最后处理数据,请求返回的是一个json数据,提取数据并四舍五入即可:

# The response groups its per-category entries into a 'pass' list and a
# 'reject' list; each entry carries a numeric 'label' and a 'score'.
result = js['data']['result']
rights = result['pass']
wrongs = result['reject']

# Rejected categories first. The format string has two placeholders, so both
# the category label and the rounded score must be supplied.
for wrong in wrongs:
    y = round(float(wrong['score']), 3)
    print('{}   分数:{}'.format(wrong['label'], y))
    print('\n')

print('--------------------------------------------')

# Then the categories that passed.
for right in rights:
    y = round(float(right['score']), 3)
    print('{}   分数:{}'.format(right['label'], y))
    print('\n')

5、对应上中文,创建一个字典:

# Maps the category label (as a string) to its Chinese display name.
dic = {
    '1': '暴恐违禁',
    '2': '文本色情',
    '3': '政治敏感',
    '4': '恶意推广',
    '5': '低俗辱骂',
    '6': '低质灌水',
}

把数据和文字相结合:

def _print_scores(items):
    """Print one '<category>   分数:<score>' line per entry, each followed by a gap."""
    for item in items:
        label = str(item['label'])
        # Fall back to the raw label id instead of raising KeyError when the
        # service returns a category that is missing from `dic`.
        x = dic.get(label, label)
        y = round(float(item['score']), 3)
        print('{}   分数:{}'.format(x, y))
        print('\n')


result = js['data']['result']
rights = result['pass']
wrongs = result['reject']

# Rejected categories, a separator line, then the categories that passed.
_print_scores(wrongs)
print('--------------------------------------------')
_print_scores(rights)

6、代码总体写好了,还需要一个总结:创建yesornot变量,reject列表不为空时为True(审核不通过),否则为False,最后根据它输出综合评价:

# `yesornot` is a boolean flag, so test it directly instead of comparing
# it to True with `==`.
if yesornot:
    print('综合评价:审核不通过')
else:
    print('综合评价:审核通过')

7、全部代码:

import requests

# Endpoint that the text-censoring demo page posts to.
url = 'https://ai.baidu.com/aidemo'

text = input('输入需要审核的文字(最多200字):   ')

# Form fields copied from the browser request seen in DevTools.
# NOTE(review): 'requestTime' and 'token' are values captured from a single
# browser session - presumably they can go stale; confirm with a fresh capture.
data = {
    'content': text,
    'type': 'textcensor',
    'apiType': 'censor',
    'requestTime': '1628780167667',
    'token': '2fe1cba870',
}

# Browser-like headers copied from DevTools, minus two entries:
# - 'content-length': requests derives it from the actual encoded body, so the
#   captured fixed value '103' does not belong here.
# - 'accept-encoding': the captured value advertised 'br'; leaving the header
#   out lets requests advertise only the encodings it is able to decode.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'content-type': 'application/x-www-form-urlencoded',
    'cookie': 'BAIDUID_BFESS=EAB301E34BAB5B3829977A4FDE00ED44:FG=1; BAIDUID=EAB301E34BAB5B385D72A84F2B5F0276:FG=1; BIDUPSID=EAB301E34BAB5B385D72A84F2B5F0276; PSTM=1628757735; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; __yjs_duid=1_cc834c6c33b9da38de2c8e4c95cf28491628775855270; Hm_lvt_8b973192450250dd85b9011320b455ba=1628775852; Hm_lpvt_8b973192450250dd85b9011320b455ba=1628780471; ab_sr=1.0.1_YjI5OTFlOWQ5OWI1ZjIwM2NmN2U5M2QyMTFiYjY2NWUxY2QxMWJhOTliOGYyN2Q1YWYwM2JhY2ZmZDU0NmQ4NTU1ZTJiNDRiN2RjMWRjOTNjYjdiNTE0YmFiOGQ3ZDE4NjljNmQ0MDk3NTg2N2QyZDJiZDA0OGI4MDdmYzRlYzkzMDI3M2Q1NWJhMzgwMzdhZDE1MGUyNGQ2MWM5MmMwMw==; __yjs_st=2_MjYxNDkyMTMwMWNiNzM2ZGU4NDM0NmQ4YTcyMTM2MTgxMTU0OWY5NzU4NzY2NjI0OTRiNTkyN2VjZjg0MDIwMjBiMWY1Zjg1YzFjNjNiY2QxNzliY2JlZWU0N2YwNGJlYjA5YzhiYzk0NGRhNTMxNGJjMjYxMjE1M2Y5YjJhNWQ1ZDE4NzFlMTNlM2VmODc2MTNlY2NlN2NlMDRkMDE1NDA2NjI1MzY5YmU0YTY4Mzc1YWYzYjRiNzUxZDRiYmJkMzRlODIyMTFiYWE3YjhjMjlhM2RmZDZkOTQ5OGRlMmQ5MjRkNWM2YWJlNDNkM2JjNmYzNzlmMTk4YmYzMmQxYl83X2U4YzNlODYy',
    'origin': 'https://ai.baidu.com',
    'referer': 'https://ai.baidu.com/tech/textcensoring',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}

def posts():
    """Send the censoring request and publish the outcome as module globals.

    Sets ``res`` to the response object returned by ``requests.post`` and
    ``js`` to its decoded JSON body. Any exception raised by the request or
    by JSON decoding propagates to the caller.
    """
    global js, res
    # Without a timeout, requests waits indefinitely if the server never answers.
    res = requests.post(url, data=data, headers=headers, timeout=10)
    js = res.json()

# Try the request a bounded number of times. Only network errors and
# undecodable JSON bodies are retried; anything else (including Ctrl+C)
# propagates immediately, and so does the error from the last attempt.
max_attempts = 3
for attempt in range(max_attempts):
    try:
        posts()
        break
    except (requests.exceptions.RequestException, ValueError):
        if attempt == max_attempts - 1:
            raise

# Show the response object for the successful attempt.
print(res, '\n')

# Maps the category label (as a string) to its Chinese display name.
dic = {
    '1': '暴恐违禁',
    '2': '文本色情',
    '3': '政治敏感',
    '4': '恶意推广',
    '5': '低俗辱骂',
    '6': '低质灌水',
}


def _print_scores(items):
    """Print one '<category>   分数:<score>' line per entry, each followed by a gap."""
    for item in items:
        label = str(item['label'])
        # Fall back to the raw label id instead of raising KeyError when the
        # service returns a category that is missing from `dic`.
        x = dic.get(label, label)
        y = round(float(item['score']), 3)
        print('{}   分数:{}'.format(x, y))
        print('\n')


result = js['data']['result']
rights = result['pass']
wrongs = result['reject']

# Any entry in the reject list means the text failed the review.
yesornot = bool(wrongs)

# Rejected categories, a separator line, then the categories that passed.
_print_scores(wrongs)
print('--------------------------------------------')
_print_scores(rights)

if yesornot:
    print('综合评价:审核不通过')
else:
    print('综合评价:审核通过')

猜你喜欢