Init
This commit is contained in:
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
*.xlsx
|
||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
151
main.py
Normal file
151
main.py
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from io import StringIO
|
||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from openpyxl.styles import Font, PatternFill
|
||||||
|
from openpyxl.utils import get_column_letter
|
||||||
|
|
||||||
|
# Browser-like request headers sent with every fetch_html() call.
# NOTE(review): assumes the result pages refuse or throttle the default
# python-requests User-Agent -- confirm the spoofed UA is actually needed.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0 Safari/537.36"
}
|
||||||
|
|
||||||
|
# Headline score labels that detect_scores() searches for in the page text;
# each label becomes a column in the "Overview" sheet. Labels not found on a
# given page map to None (see extract_text_score).
SCORE_LABELS = [
    "Single-Core Score",
    "Multi-Core Score",
    "Metal Score",
    "OpenCL Score",
    "Vulkan Score",
    "CUDA Score",
]
|
||||||
|
|
||||||
|
def clean_sheet_name(name: str) -> str:
    """Sanitize *name* into a valid Excel worksheet title.

    Excel rejects the characters \\ / * ? : [ ] in sheet names and caps
    titles at 31 characters. Each forbidden character becomes an
    underscore; an empty result falls back to "Sheet".
    """
    forbidden = '\\/*?:[]'
    sanitized = "".join("_" if ch in forbidden else ch for ch in name)
    return sanitized[:31] or "Sheet"
|
||||||
|
|
||||||
|
def fetch_html(url: str) -> str:
    """Download *url* and return the response body as text.

    Sends the browser-like HEADERS with a 20-second network timeout;
    raises requests.HTTPError (via raise_for_status) on a non-2xx reply.
    """
    response = requests.get(url, headers=HEADERS, timeout=20)
    response.raise_for_status()
    return response.text
|
||||||
|
|
||||||
|
def extract_text_score(text: str, label: str):
    """Find the numeric score associated with *label* in plain text.

    Accepts either "1234 Some Label" or "Some Label 1234" ordering,
    matched case-insensitively. Returns the number as a string with
    thousands separators stripped, or None when no match exists.
    """
    escaped = re.escape(label)
    candidates = (
        rf"(\d[\d,\.]*)\s+{escaped}",  # number before the label
        rf"{escaped}\s+(\d[\d,\.]*)",  # label before the number
    )
    for candidate in candidates:
        match = re.search(candidate, text, re.IGNORECASE)
        if match is None:
            continue
        return match.group(1).replace(",", "")
    return None
|
||||||
|
|
||||||
|
def parse_tables(html: str):
    """Pair every <table> in *html* with its nearest preceding heading.

    Returns a ``(title, text, blocks)`` triple: the page title (or the
    fallback "Geekbench Result"), the document's full visible text, and
    a list of ``(section_heading, DataFrame)`` tuples. Tables that
    pandas cannot parse are skipped silently.
    """
    soup = BeautifulSoup(html, "html.parser")
    blocks = []
    section = "General"

    # Walk headings and tables in document order so each table is tagged
    # with the most recent heading seen before it.
    for element in soup.find_all(["h1", "h2", "h3", "h4", "table"]):
        if element.name == "table":
            try:
                frame = pd.read_html(StringIO(str(element)))[0]
                frame = frame.dropna(how="all").dropna(axis=1, how="all")
            except ValueError:
                # Table had nothing pandas could parse; skip it.
                continue
            blocks.append((section, frame))
        else:
            section = " ".join(element.get_text(" ", strip=True).split())

    if soup.title:
        title = soup.title.get_text(" ", strip=True)
    else:
        title = "Geekbench Result"
    text = soup.get_text("\n", strip=True)
    return title, text, blocks
|
||||||
|
|
||||||
|
def detect_scores(text: str):
    """Map every label in SCORE_LABELS to its value found in *text*.

    Labels absent from the text map to None (see extract_text_score).
    """
    return {label: extract_text_score(text, label) for label in SCORE_LABELS}
|
||||||
|
|
||||||
|
def write_block(ws, start_row, title, df):
    """Render one (title, DataFrame) section onto worksheet *ws*.

    Writes a bold section title at ``start_row``, a shaded header row
    beneath it, then the frame's values (NaN rendered as ""). Returns
    the row index where the next section should begin, leaving a
    two-row gap after the data.
    """
    title_cell = ws.cell(start_row, 1, title)
    title_cell.font = Font(bold=True)

    header_row = start_row + 1
    for col_idx, col_name in enumerate(df.columns, start=1):
        header = ws.cell(header_row, col_idx, str(col_name))
        header.font = Font(bold=True)
        header.fill = PatternFill("solid", fgColor="D9EAF7")

    first_data_row = header_row + 1
    for offset, record in enumerate(df.itertuples(index=False)):
        for col_idx, value in enumerate(record, start=1):
            rendered = "" if pd.isna(value) else str(value)
            ws.cell(first_data_row + offset, col_idx, rendered)

    return header_row + len(df) + 3
|
||||||
|
|
||||||
|
def autofit(ws):
    """Size each column of *ws* to fit its longest rendered value.

    Width is the longest stringified cell value plus 2 characters of
    padding, capped at 60 so very long cells don't blow up the layout.
    """
    for column in ws.columns:
        letter = get_column_letter(column[0].column)
        longest = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        ws.column_dimensions[letter].width = min(longest + 2, 60)
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: scrape Geekbench result pages into one workbook.

    Usage: python geekbench_to_excel.py output.xlsx <url1> <url2> ...

    Each URL gets its own detail sheet (title, URL, then every table
    found on the page); a final "Overview" sheet lists the headline
    scores for all results side by side. Exits with status 1 on
    missing arguments.
    """
    if len(sys.argv) < 3:
        print("Usage: python geekbench_to_excel.py output.xlsx <url1> <url2> ...")
        sys.exit(1)

    output_file = Path(sys.argv[1])
    urls = sys.argv[2:]

    print(f"Processing {len(urls)} URLs...")

    overview_rows = []

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for i, url in enumerate(urls, start=1):
            # Robustness fix: previously a single unreachable/failing URL
            # raised out of fetch_html and aborted the whole batch with no
            # workbook written. Report the failure and keep going instead.
            try:
                html = fetch_html(url)
            except requests.RequestException as exc:
                print(f"Skipping {url}: {exc}")
                continue

            title, text, blocks = parse_tables(html)
            scores = detect_scores(text)

            # Sheet name: running index plus the title up to the first
            # dash, sanitized/truncated for Excel's naming rules.
            sheet_name = clean_sheet_name(f"{i}_{title.split('-')[0].strip()}")
            ws = writer.book.create_sheet(sheet_name)

            ws["A1"] = title
            ws["A1"].font = Font(bold=True, size=14)
            ws["A2"] = url

            row = 4
            for section_title, df in blocks:
                row = write_block(ws, row, section_title, df)

            ws.freeze_panes = "A4"  # keep title/URL visible while scrolling
            autofit(ws)

            overview_rows.append({
                # NOTE(review): "Titel" is German for "Title" while the rest
                # of the output is English -- intentional? Renaming would
                # change the visible Overview column header, so left as-is.
                "Titel": title,
                "URL": url,
                **scores,
            })

        # Summary sheet: one row per successfully processed result.
        overview_df = pd.DataFrame(overview_rows)
        overview_df.to_excel(writer, sheet_name="Overview", index=False)

        ws = writer.book["Overview"]
        for cell in ws[1]:
            cell.font = Font(bold=True)
            cell.fill = PatternFill("solid", fgColor="B7DEE8")
        ws.freeze_panes = "A2"
        autofit(ws)

        # Drop openpyxl's default empty "Sheet" if any real sheet exists.
        if "Sheet" in writer.book.sheetnames and len(writer.book.sheetnames) > 1:
            del writer.book["Sheet"]

    print(f"Finished: {output_file}")
|
||||||
|
|
||||||
|
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user