first version

uskovgs 2025-06-08 01:53:57 +03:00
parent cac9701e3b
commit 1c66157b68
4 changed files with 310 additions and 41 deletions

.gitignore

@@ -1,4 +1,6 @@
 *.pyc
 *.pyo
 __pycache__/
 .quarto
+reference/
+tests/


@@ -18,3 +18,96 @@ baskets = list_baskets(sess)
 # download basket ART-XC agns
 df_basket = get_basket(sess, basket='ART-XC agns')
+from srgweb.artsurvey import (
+    artsurvey_session,
+    get_artsurvey_cat,
+    default_artsurvey_settings  # needed for the parameter merge below
+)
+import keyring
+
+sess = artsurvey_session(
+    "uskov",
+    keyring.get_password("SRG_ARTSURVEY", ""),
+    base_url="http://10.5.2.25/"
+)
+
+base_url = sess.base_url
+settings_url = f"{base_url.rstrip('/')}/artsurvey"
+resp = sess.get(settings_url)
+
+from bs4 import BeautifulSoup
+soup = BeautifulSoup(resp.text, "html.parser")
+form = soup.find("form")
+select = form.find("select", {"id": "id_survey"})
+survey_options = {
+    option.text.strip(): option.get("value")
+    for option in select.find_all("option")
+    if option.get("value")
+}
+
+# 3. Merge the parameters
+params = {**default_artsurvey_settings(), **{}}  # no user overrides in this walkthrough
+params["survey"] = survey_options['S1-5v12345.12']
+
+# 4. Build the payload (only the filters actually present in the form)
+from bs4 import BeautifulSoup, Tag
+import requests
+
+def build_payload(form: Tag, overrides: dict[str, str]) -> dict[str, str]:
+    """
+    Build the payload from **all** form fields, then apply overrides.
+    Mirrors the behaviour of rvest::html_form_set() one-to-one.
+    """
+    payload: dict[str, str] = {}
+    for el in form.find_all(["input", "select", "textarea"]):
+        name = el.get("name")
+        if not name:
+            continue
+        tag_type = el.get("type", "").lower()
+        value: str
+        # --- SELECT -------------------------------------------------
+        if el.name == "select":
+            selected = el.find("option", selected=True) or el.find("option")
+            value = selected.get("value", "") if selected else ""
+        # --- CHECKBOX / RADIO --------------------------------------
+        elif tag_type in {"checkbox", "radio"}:
+            # rvest sends "" for UNchecked checkboxes
+            if el.has_attr("checked"):
+                value = el.get("value", "on")  # browsers send "on" if value is absent
+            else:
+                value = ""
+        # --- TEXT, NUMBER, HIDDEN and other INPUTs ------------------
+        else:
+            value = el.get("value", "")
+        payload[name] = value
+    # User-supplied values take precedence
+    payload.update(overrides)
+    return payload
+
+payload = build_payload(form, params)
+payload
+
+resp = sess.post(
+    settings_url,
+    data=payload,
+    headers={"Referer": "http://10.5.2.25/login"},  # <- critical
+)
+
+from urllib.parse import urlencode
+print(urlencode(payload, doseq=True))
+
+resp.raise_for_status()
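
For completeness, the filtered catalog can then be downloaded over the same session, mirroring the CSV step that get_artsurvey_cat performs below; a minimal sketch, assuming the POST above succeeded:

from io import StringIO
import pandas as pd

# /artsurvey/csv/all returns the catalog for the filters currently stored in the session
csv_url = f"{base_url.rstrip('/')}/artsurvey/csv/all"
resp = sess.get(csv_url)
resp.raise_for_status()
df = pd.read_csv(StringIO(resp.text), na_values=["", "None"])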

src/srgweb/artsurvey.py

@@ -85,6 +85,7 @@ def artsurvey_session(
     console.print(
         f"[green]✔[/green] Logged in as [cyan]{username}[/cyan] to [link={base_url.rstrip('/')}][blue underline]<{base_url.rstrip('/')}>"
     )
+    session.base_url = base_url
     return session
 
 def default_artsurvey_settings() -> dict[str, str]:
@@ -118,7 +119,7 @@ def default_artsurvey_settings() -> dict[str, str]:
         "class_startswith": "",
         "cname_contains": "",
         "category": "",
-        "exclude_category": "",
+        "exclude_category": ""
         # "category_unclassified": "",
         # "gaia_primary": "",
         # "allwise_primary": "",
@@ -132,10 +133,31 @@ def default_artsurvey_settings() -> dict[str, str]:
         # "circle_rmax_deg": ""
     }
 
-def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.12", **kwargs):
-    # 2. Fetch the settings form
+def get_artsurvey_cat(
+    session: requests.Session,
+    survey_name: str = "S1-5v12345.12",
+    **kwargs
+) -> pd.DataFrame:
+    """
+    Fetch the ARTSurvey catalog with filters applied.
+
+    Parameters
+    ----------
+    session : requests.Session
+        Authorized session (created via artsurvey_session()).
+    survey_name : str
+        Survey name as it appears in the form, e.g. "S1-5v12345.12".
+    **kwargs : dict
+        Override values for form parameters.
+
+    Returns
+    -------
+    pd.DataFrame
+    """
+    base_url = session.base_url
+    # 1. Open the settings page
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Updating the survey filters", total=None)
+        progress.add_task(description="Updating the ARTSurvey filters", total=None)
         settings_url = f"{base_url.rstrip('/')}/artsurvey"
         resp = session.get(settings_url)
         resp.raise_for_status()
@@ -143,58 +165,55 @@ def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.
     form = soup.find("form")
     if not form:
-        console.print("[red]❌ Filter form not found on the artsurvey page.[/red]")
+        console.print("[red]❌ Filter form not found.[/red]")
         return pd.DataFrame()
 
-    # 3. Collect the form parameters
+    # 2. Get the list of available surveys
+    select = form.find("select", {"id": "id_survey"})
+    if not select:
+        console.print("[red]❌ Survey list (id_survey) not found.[/red]")
+        return pd.DataFrame()
+    survey_options = {
+        option.text.strip(): option.get("value")
+        for option in select.find_all("option")
+        if option.get("value")
+    }
+    if survey_name not in survey_options:
+        console.print(f"[red]❌ Survey '{survey_name}' not found. Available: {list(survey_options.keys())}[/red]")
+        return pd.DataFrame()
+
+    # 3. Merge the parameters
+    params = {**default_artsurvey_settings(), **kwargs}
+    params["survey"] = survey_options[survey_name]
+
+    # 4. Build the payload (only the filters actually present in the form)
     payload = {}
-    for tag in form.find_all(["input", "select"]):
-        name = tag.get("name")
-        if name and name in params:
-            payload[name] = params[name]
+    # 4.1 input fields
+    for input_tag in form.find_all("input"):
+        name = input_tag.get("name")
+        if not name:
+            continue
+        payload[name] = input_tag.get("value", "")
+    # 4.2 select fields: pick the option whose text matches the requested survey
+    for select_tag in form.find_all("select"):
+        name = select_tag.get("name")
+        if not name:
+            continue
+        options = select_tag.find_all("option")
+        selected = None
+        for option in options:
+            if option.text.strip() == survey_name:
+                selected = option.get("value")
+                break
+        if selected:
+            payload[name] = selected
+    # 4.3 user-supplied arguments take precedence
+    for k, v in kwargs.items():
+        payload[k] = v
 
-    # 4. Submit the filters
     action = form.get("action") or settings_url
     full_action = action if action.startswith("http") else requests.compat.urljoin(settings_url, action)
     resp = session.post(full_action, data=payload, headers={"Referer": settings_url})
     resp.raise_for_status()
 
-    # 5. Load the CSV
+    # 5. Download the CSV
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Downloading the ART-Survey catalog (~10-20 s)", total=None)
+        progress.add_task(description="Downloading the ARTSurvey catalog", total=None)
         csv_url = f"{base_url.rstrip('/')}/artsurvey/csv/all"
         resp = session.get(csv_url)
        resp.raise_for_status()
-    df = pd.read_csv(pd.compat.StringIO(resp.text), na_values=["", "None"])
+    df = pd.read_csv(StringIO(resp.text), na_values=["", "None"])
 
-    # 6. Post-processing
     if df.empty:
-        console.print("[red]Catalog is empty. The filters may be too strict.[/red]")
+        console.print("[red]Catalog is empty. Check the filters.[/red]")
         return df
-    if "ra" in df.columns and "dec" in df.columns:
-        coords = SkyCoord(ra=df["ra"].values * u.deg, dec=df["dec"].values * u.deg, frame="icrs")
-        df["skycoord"] = coords
+    if {"ra", "dec"}.issubset(df.columns):
+        df["skycoord"] = SkyCoord(df["ra"] * u.deg, df["dec"] * u.deg, frame="icrs")
     return df
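
A minimal usage sketch of the updated function; every keyword argument overrides the matching form field from default_artsurvey_settings(), and survey_name must match an option of the form's id_survey select:

import keyring
from srgweb.artsurvey import artsurvey_session, get_artsurvey_cat

sess = artsurvey_session(
    "uskov",
    keyring.get_password("SRG_ARTSURVEY", ""),
    base_url="http://10.5.2.25/",
)
# "exclude_category" is one of the default form fields shown above
df = get_artsurvey_cat(sess, survey_name="S1-5v12345.12", exclude_category="")
print(len(df), "rows")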

src/srgweb/publications.py (new file, +155 lines)

@@ -0,0 +1,155 @@
"""Utility functions to scrape SRG publication & telegram pages.
Dependencies
------------
- requests
- beautifulsoup4
- pandas
- rich (optional, for nice progress)
Example
-------
>>> from srg_publications import (
... parse_srg_paper_links,
... get_df_from_srg_papers,
... get_df_from_srg_telegrams,
... )
>>> df_papers = get_df_from_srg_papers()
>>> df_tg = get_df_from_srg_telegrams()
"""
from __future__ import annotations
import re
from typing import List, Dict
import requests
from bs4 import BeautifulSoup, Tag
import pandas as pd
from rich.progress import Progress, SpinnerColumn, TextColumn
BASE_SITE = "https://www.srg.cosmos.ru"
PUBLICATIONS_URL = f"{BASE_SITE}/publications/"
TELEGRAMS_ATEL_URL = f"{BASE_SITE}/publications/telegrams/atel"
TELEGRAMS_GCN_URL = f"{BASE_SITE}/publications/telegrams/gcn"
def clear_arxiv_link(arxiv_abs_link: str | None) -> str | None:
"""Normalise an arXiv *abs* URL into canonical `<id>` form.
Examples
--------
>>> clear_arxiv_link("https://arxiv.org/abs/2301.01234v2")
'2301.01234'
>>> clear_arxiv_link("arXiv:2209.00001v1")
'2209.00001'
"""
if not arxiv_abs_link:
return None
# remove version suffix like v2
cleaned = re.sub(r"v\d+$", "", arxiv_abs_link.strip())
# remove protocol and prefix
cleaned = re.sub(r"https?://arxiv\.org/abs/", "", cleaned)
cleaned = cleaned.replace("arXiv:", "")
return cleaned
# ----------------------------------------------------------------------------
# 1. Publication list helpers
# ----------------------------------------------------------------------------
def _session_for(url: str) -> requests.Session:
sess = requests.Session()
sess.headers.update({
"User-Agent": "srgweb, Python package (uskov@cosmos.ru)"
})
# prime cookies
sess.get(url)
return sess
def parse_srg_paper_links(page_url: str) -> dict[str, str | None]:
"""Parse individual SRG paper page and return arXiv + ADS links.
Parameters
----------
page_url : str
Full URL to SRG paper detail page.
Returns
-------
dict with keys ``srg_arxiv_url`` and ``srg_bibcode``.
Missing links are ``None``.
"""
sess = _session_for(page_url)
soup = BeautifulSoup(sess.get(page_url).text, "html.parser")
paper_links = [a.get("href") for a in soup.select("li a[href]")]
arxiv_link = next((l for l in paper_links if "arxiv.org/abs" in l), None)
adsabs_link = next((l for l in paper_links if "ui.adsabs.harvard.edu" in l), None)
return {
"srg_arxiv_url": arxiv_link,
"srg_bibcode": adsabs_link,
}
+
+
+def get_srg_publications(progress: bool = True) -> pd.DataFrame:
+    """Scrape the main publications page and return a DataFrame.
+
+    Columns
+    -------
+    title_srg : Publication title shown on SRG site.
+    page_srg : Full SRG page URL.
+    srg_arxiv : Canonical arXiv ID (if any).
+    srg_bibcode : ADS bibcode (if any).
+    srg_arxiv_url : original arXiv URL.
+    """
+    sess = _session_for(PUBLICATIONS_URL)
+    soup = BeautifulSoup(sess.get(PUBLICATIONS_URL).text, "html.parser")
+    # Remove buttons that interfere with finding <a>
+    for btn in soup.select(".btn"):
+        btn.decompose()
+    anchors = soup.select("tbody a")
+    titles: List[str] = [a.select_one("strong").text.strip() for a in anchors]
+    page_urls: List[str] = [BASE_SITE + a.get("href") for a in anchors]
+    iterator = zip(titles, page_urls)
+    records: List[Dict[str, str | None]] = []
+    if progress:
+        bar = Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True)
+        task_desc = "Parsing arXiv/ADS links"
+        with bar:
+            t = bar.add_task(task_desc, total=len(titles))
+            for title, link in iterator:
+                links = parse_srg_paper_links(link)
+                rec = {
+                    "title_srg": title,
+                    "page_srg": link,
+                    **links,
+                }
+                rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+                if rec["srg_bibcode"]:
+                    rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
+                records.append(rec)
+                bar.update(t, advance=1)
+    else:
+        for title, link in iterator:
+            links = parse_srg_paper_links(link)
+            rec = {
+                "title_srg": title,
+                "page_srg": link,
+                **links,
+            }
+            rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+            if rec["srg_bibcode"]:
+                rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
+            records.append(rec)
+    return pd.DataFrame.from_records(records)
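
A minimal usage sketch for the scraper, using only names defined in this module; the per-paper call reuses a page URL from the scraped table rather than inventing one:

from srgweb.publications import get_srg_publications, parse_srg_paper_links

# Scrape the publications table and resolve arXiv/ADS links for every entry.
df = get_srg_publications(progress=False)
print(df[["title_srg", "srg_arxiv", "srg_bibcode"]].head())

# Inspect the raw links for a single paper page.
links = parse_srg_paper_links(df["page_srg"].iloc[0])
print(links)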