first commit

This commit is contained in:
2026-01-14 17:19:21 +08:00
parent 10cee828d7
commit 88e5bb418a
5 changed files with 196 additions and 1 deletions

View File

@@ -1,2 +1,14 @@
# football-data
# football_data
统计一些足球数据
## 爬取足球赛录像
football_game.py
## 计算球员赛季正负值
足球引入正负值的概念,参考视频:<https://www.bilibili.com/video/BV1kYBBYEEW7/>
football_data.py 是对视频中计算方法的实现:首先爬取指定网页的表格数据,然后进行可视化。巴塞罗那 23/24 赛季球员正负值展示如下:
![image-20241125104039979](./assets/image-20241125104039979.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 82 KiB

78
football_data.py Normal file
View File

@@ -0,0 +1,78 @@
import requests
from lxml import etree
import pandas as pd
import matplotlib.pyplot as plt
# Transfermarkt performance-data page that is scraped for the squad table.
url = 'https://www.transfermarkt.com/manchester-city/leistungsdaten/verein/281/plus/1?reldata=%262023'
myheader = {'user-agent': 'Chrome'}


def _parse_number(text, thousands_separators=''):
    """Convert scraped cell text to a number without using ``eval``.

    Args:
        text: Raw text of a table cell, or ``None`` when the cell is missing.
        thousands_separators: Characters to drop before parsing, e.g. ``'.,'``
            for minute totals rendered like ``2.345'``.

    Returns:
        An ``int`` when the cleaned text is a whole number, a ``float`` when
        it is a decimal, and ``None`` when the text is missing or not numeric
        (for example a "-" placeholder).
    """
    if text is None:
        return None
    # Drop surrounding whitespace and the trailing minute mark (').
    cleaned = text.strip().rstrip("'").strip()
    for separator in thousands_separators:
        cleaned = cleaned.replace(separator, '')
    try:
        return int(cleaned)
    except ValueError:
        pass
    try:
        return float(cleaned)
    except ValueError:
        return None


# A timeout keeps the script from hanging forever on a stalled connection, and
# an HTTP error is reported here instead of surfacing later as an empty table.
response = requests.get(url, headers=myheader, timeout=30)
response.raise_for_status()
html = response.content
tree = etree.HTML(html)
players = tree.xpath('//tbody/tr') if tree is not None else []

# Extract one record per table row.
player_data = []
for player in players:
    # Player name is read from the title attribute of the row's image.
    name = player.xpath('.//img/@title')
    name = name[0] if name else None

    # Points per game (PPG).
    ppg = player.xpath('.//td[contains(@class, "cp")]/text()')
    ppg = _parse_number(ppg[0]) if ppg else None
    # Skip players who never appeared (PPG of 0) or whose PPG is missing.
    if not ppg:
        continue

    # Minutes played; the cell looks like "2.345'" so separators are removed.
    playing_time = player.xpath('.//td[contains(@class, "rechts")]/text()')
    playing_time = (
        _parse_number(playing_time[0], thousands_separators='.,')
        if playing_time else None
    )

    # Skip the row when the name or the playing time is missing.
    if name and playing_time:
        player_data.append({
            "name": name,
            "PPG": ppg,
            "playing_time": playing_time
        })

if not player_data:
    # Without this guard the column access below fails with an opaque KeyError.
    raise SystemExit('No player rows could be parsed from ' + url)

# Store the records in a DataFrame.
df = pd.DataFrame(player_data)
# "Matches" is the playing time expressed in 90-minute games, rounded to 2 dp.
df['Matches'] = (df['playing_time'] / 90).round(2)
print(df)

# Scatter plot of PPG against matches played.
plt.figure(figsize=(8, 6))
plt.scatter(df['Matches'], df['PPG'], color='blue', label='Player Data')
# Annotate every point with the player's name.
for matches, points, player_name in zip(df['Matches'], df['PPG'], df['name']):
    plt.text(matches, points, player_name, fontsize=9, ha='right', va='bottom')

# Splitting lines; the legend labels are derived from the values so the two
# can no longer drift apart.
split_x = 30
split_y = 2.37
plt.axhline(y=split_y, color='red', linestyle='--', linewidth=1.5,
            label=f'PPG Split ({split_y})')
plt.axvline(x=split_x, color='green', linestyle='--', linewidth=1.5,
            label=f'Matches Split ({split_x})')

# Title and axis labels.
plt.title("Scatter Plot with Player Names and Splitting Lines", fontsize=14)
plt.xlabel("Matches (Games Played)", fontsize=12)
plt.ylabel("PPG (Points Per Game)", fontsize=12)
plt.grid(alpha=0.3)
plt.legend()
plt.show()

75
football_game.ipynb Normal file
View File

@@ -0,0 +1,75 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from lxml import etree\n",
"\n",
"key='AC米兰'\n",
"\n",
"url = f'https://www.zhibo8.com/schedule/finish_more.htm'\n",
"myheader = {'user-agent': 'Chrome'}\n",
"response = requests.get(url, headers=myheader)\n",
"# response.encoding = 'utf-8'\n",
"html = response.content\n",
"element = etree.HTML(html)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://news.zhibo8.com/zuqiu/2024-10-30/match1448348date2024vnative.htm\n"
]
}
],
"source": [
"span_elements = element.xpath('//li/span[@class=\"_teams\"]')\n",
"for span in span_elements:\n",
" team=span.text\n",
" # print(team)\n",
" if key in team:\n",
" # print(team)\n",
" url_jijin=span.xpath('../a')[-1].get('href')\n",
" print(url_jijin)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

30
football_game.py Normal file
View File

@@ -0,0 +1,30 @@
import requests
from lxml import etree
from urllib.parse import urljoin

# Only the team listed on the left-hand side of a fixture can be searched,
# because only the leading text of the "_teams" span is inspected.
key = '伊普斯维奇'
url = 'https://www.zhibo8.com/schedule/finish_more.htm'
myheader = {'user-agent': 'Chrome'}
# A timeout avoids hanging on a stalled connection; HTTP errors fail loudly.
response = requests.get(url, headers=myheader, timeout=30)
response.raise_for_status()
# This only influences response.text; the parser below receives raw bytes.
response.encoding = 'utf-8'
html = response.content
element = etree.HTML(html)

# Locate the first fixture whose leading text contains the key.
span_elements = (
    element.xpath('//li/span[@class="_teams"]') if element is not None else []
)
# span.text is None when the span starts with a child element, hence the `or`.
span = next((s for s in span_elements if key in (s.text or '')), None)
if span is None:
    # Previously the loop fell through and silently used the last fixture.
    raise SystemExit(f'No finished match found for team {key!r}')

# Collect the links that sit next to the matched span, ignoring empty hrefs.
a_elements = span.xpath('../a')
hrefs = [a.get('href') for a in a_elements if a.get('href')]
if not hrefs:
    raise SystemExit(f'No links found for the match of team {key!r}')

# NOTE(review): the last link in the row is taken to be the highlights
# ("jijin") page - confirm against the live page layout.
url_jijin = hrefs[-1]
# urljoin copes with relative and already-absolute hrefs alike.
url_luxiang = urljoin('https://www.zhibo8.com',
                      url_jijin.replace('jijin', 'luxiang'))
print(url_jijin)
print(url_luxiang)
# Example result: https://www.zhibo8.com/zuqiu/2024/1030-match1448348v-luxiang.htm