first version
This commit is contained in:
parent cac9701e3b
commit 1c66157b68
2 .gitignore (vendored)
@@ -2,3 +2,5 @@
 *.pyo
 __pycache__/
 .quarto
+reference/
+tests/
93 examples.py
@@ -18,3 +18,96 @@ baskets = list_baskets(sess)
 # download basket ART-XC agns
 df_basket = get_basket(sess, basket='ART-XC agns')
 
+
+from srgweb.artsurvey import (
+    artsurvey_session,
+    get_artsurvey_cat
+)
+import keyring
+sess = artsurvey_session(
+    "uskov",
+    keyring.get_password("SRG_ARTSURVEY", ""),
+    base_url="http://10.5.2.25/"
+)
+
+base_url = sess.base_url
+settings_url = f"{base_url.rstrip('/')}/artsurvey"
+
+resp = sess.get(settings_url)
+from bs4 import BeautifulSoup
+soup = BeautifulSoup(resp.text, "html.parser")
+
+form = soup.find("form")
+
+select = form.find("select", {"id": "id_survey"})
+
+survey_options = {
+    option.text.strip(): option.get("value")
+    for option in select.find_all("option")
+    if option.get("value")
+}
+
+# 3. Merge the parameters
+params = {**default_artsurvey_settings(), **{}}
+params["survey"] = survey_options['S1-5v12345.12']
+
+# 4. Build the payload (only the filters actually present in the form)
+from bs4 import BeautifulSoup, Tag
+import requests
+
+def build_payload(form: Tag, overrides: dict[str, str]) -> dict[str, str]:
+    """
+    Collect the payload from **all** form fields, then apply overrides.
+    Behaviour matches rvest::html_form_set() one-to-one.
+    """
+    payload: dict[str, str] = {}
+
+    for el in form.find_all(["input", "select", "textarea"]):
+        name = el.get("name")
+        if not name:
+            continue
+
+        tag_type = el.get("type", "").lower()
+        value: str
+
+        # --- SELECT -------------------------------------------------
+        if el.name == "select":
+            selected = el.find("option", selected=True)
+            value = selected.get("value") if selected else el.find("option").get("value", "")
+
+        # --- CHECKBOX / RADIO ----------------------------------------
+        elif tag_type in {"checkbox", "radio"}:
+            # rvest stores "" for UNchecked checkboxes
+            if el.has_attr("checked"):
+                value = el.get("value", "on")  # browsers send "on" when value is absent
+            else:
+                value = ""
+
+        # --- TEXT, NUMBER, HIDDEN and other INPUTs --------------------
+        else:
+            value = el.get("value", "")
+
+        payload[name] = value
+
+    # Apply the user-supplied overrides
+    payload.update(overrides)
+    return payload
+
+payload = build_payload(form, params)
+payload
+
+
+resp = sess.post(
+    settings_url,
+    data=payload,
+    headers={"Referer": "http://10.5.2.25/login"},  # <- critical
+)
+
+from urllib.parse import urlencode
+print(urlencode(payload, doseq=True))
+
+resp.raise_for_status()
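For illustration, a minimal sketch of what build_payload produces on a toy form. The HTML and field names below are invented for this note, not taken from the artsurvey page; build_payload is the function defined in examples.py above.

# Hypothetical mini-form exercising build_payload's three branches.
from bs4 import BeautifulSoup

html = """
<form>
  <input type="hidden" name="token" value="abc123">
  <input type="text" name="cname_contains" value="">
  <input type="checkbox" name="gaia_primary" checked>
  <select name="survey">
    <option value="1">S1-5v12345.12</option>
    <option value="2" selected>other</option>
  </select>
</form>
"""
form = BeautifulSoup(html, "html.parser").find("form")

# Overrides win over the scraped defaults, as in rvest::html_form_set().
print(build_payload(form, {"cname_contains": "NGC"}))
# -> {'token': 'abc123', 'cname_contains': 'NGC', 'gaia_primary': 'on', 'survey': '2'}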
src/srgweb/artsurvey.py
@@ -85,6 +85,7 @@ def artsurvey_session(
     console.print(
         f"[green]✔[/green] Logged in as [cyan]{username}[/cyan] to [link={base_url.rstrip('/')}][blue underline]<{base_url.rstrip('/')}>"
     )
+    session.base_url = base_url
     return session
 
 def default_artsurvey_settings() -> dict[str, str]:
@@ -118,7 +119,7 @@ def default_artsurvey_settings() -> dict[str, str]:
         "class_startswith": "",
         "cname_contains": "",
         "category": "",
-        "exclude_category": "",
+        "exclude_category": ""
         # "category_unclassified": "",
         # "gaia_primary": "",
         # "allwise_primary": "",
@@ -132,10 +133,31 @@ def default_artsurvey_settings() -> dict[str, str]:
         # "circle_rmax_deg": ""
     }
 
-def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.12", **kwargs):
-    # 2. Get the settings form
+def get_artsurvey_cat(
+    session: requests.Session,
+    survey_name: str = "S1-5v12345.12",
+    **kwargs
+) -> pd.DataFrame:
+    """
+    Fetch the ARTSurvey catalogue with filters applied.
+
+    Parameters
+    ----------
+    session : requests.Session
+        Authenticated session (from artsurvey_session()).
+    survey_name : str
+        Survey name as shown in the form, e.g. "S1-5v12345.12".
+    **kwargs : dict
+        Values that override the form parameters.
+
+    Returns
+    -------
+    pd.DataFrame
+    """
+    base_url = session.base_url
+    # 1. Open the settings page
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Updating survey filters", total=None)
+        progress.add_task(description="Updating ARTSurvey filters", total=None)
         settings_url = f"{base_url.rstrip('/')}/artsurvey"
         resp = session.get(settings_url)
         resp.raise_for_status()
@@ -143,58 +165,55 @@ def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.12", **kwargs):
 
         form = soup.find("form")
         if not form:
-            console.print("[red]❌ Filter form not found on the artsurvey page.[/red]")
+            console.print("[red]❌ Filter form not found.[/red]")
             return pd.DataFrame()
 
-        # 3. Collect the form parameters
+        # 2. Get the list of available surveys
+        select = form.find("select", {"id": "id_survey"})
+        if not select:
+            console.print("[red]❌ Survey list (id_survey) not found.[/red]")
+            return pd.DataFrame()
+
+        survey_options = {
+            option.text.strip(): option.get("value")
+            for option in select.find_all("option")
+            if option.get("value")
+        }
+
+        if survey_name not in survey_options:
+            console.print(f"[red]❌ Survey '{survey_name}' not found. Available: {list(survey_options.keys())}[/red]")
+            return pd.DataFrame()
+
+        # 3. Merge the parameters
+        params = {**default_artsurvey_settings(), **kwargs}
+        params["survey"] = survey_options[survey_name]
+
+        # 4. Build the payload (only the filters actually present in the form)
         payload = {}
+        for tag in form.find_all(["input", "select"]):
+            name = tag.get("name")
+            if name and name in params:
+                payload[name] = params[name]
 
-        # 3.1 input fields
-        for input_tag in form.find_all("input"):
-            name = input_tag.get("name")
-            if not name:
-                continue
-            payload[name] = input_tag.get("value", "")
-
-        # 3.2 select fields
-        for select_tag in form.find_all("select"):
-            name = select_tag.get("name")
-            if not name:
-                continue
-            options = select_tag.find_all("option")
-            selected = None
-            for option in options:
-                if option.text.strip() == survey_name:
-                    selected = option.get("value")
-                    break
-            if selected:
-                payload[name] = selected
-
-        # 3.3 user-supplied arguments (take precedence)
-        for k, v in kwargs.items():
-            payload[k] = v
-
         # 4. Submit the filters
         action = form.get("action") or settings_url
         full_action = action if action.startswith("http") else requests.compat.urljoin(settings_url, action)
 
         resp = session.post(full_action, data=payload, headers={"Referer": settings_url})
         resp.raise_for_status()
 
-    # 5. Load the CSV
+    # 5. Download the CSV
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Downloading the ART-Survey catalogue (~10-20 seconds)", total=None)
+        progress.add_task(description="Downloading the ARTSurvey catalogue", total=None)
         csv_url = f"{base_url.rstrip('/')}/artsurvey/csv/all"
         resp = session.get(csv_url)
         resp.raise_for_status()
-        df = pd.read_csv(pd.compat.StringIO(resp.text), na_values=["", "None"])
+        df = pd.read_csv(StringIO(resp.text), na_values=["", "None"])
 
     # 6. Post-processing
     if df.empty:
-        console.print("[red]The catalogue is empty. The filters may be too strict.[/red]")
+        console.print("[red]The catalogue is empty. Check the filters.[/red]")
         return df
 
-    if "ra" in df.columns and "dec" in df.columns:
-        coords = SkyCoord(ra=df["ra"].values * u.deg, dec=df["dec"].values * u.deg, frame="icrs")
-        df["skycoord"] = coords
+    if {"ra", "dec"}.issubset(df.columns):
+        df["skycoord"] = SkyCoord(df["ra"] * u.deg, df["dec"] * u.deg, frame="icrs")
 
     return df
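Taken together, the new artsurvey API is meant to be used roughly as below. A sketch under assumptions: the password is stored in keyring as in examples.py, cname_contains is one of the default_artsurvey_settings() keys shown above, and the override value "NGC" is illustrative only.

import keyring
from srgweb.artsurvey import artsurvey_session, get_artsurvey_cat

sess = artsurvey_session(
    "uskov",
    keyring.get_password("SRG_ARTSURVEY", ""),
    base_url="http://10.5.2.25/",
)

# Any keyword argument overrides the corresponding form filter.
df = get_artsurvey_cat(sess, survey_name="S1-5v12345.12", cname_contains="NGC")
print(len(df), list(df.columns)[:5])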
155 src/srgweb/publications.py (new file)
@@ -0,0 +1,155 @@
+"""Utility functions to scrape SRG publication & telegram pages.
+
+Dependencies
+------------
+- requests
+- beautifulsoup4
+- pandas
+- rich (optional, for nice progress)
+
+Example
+-------
+>>> from srgweb.publications import (
+...     parse_srg_paper_links,
+...     get_df_from_srg_papers,
+...     get_df_from_srg_telegrams,
+... )
+>>> df_papers = get_df_from_srg_papers()
+>>> df_tg = get_df_from_srg_telegrams()
+"""
+
+from __future__ import annotations
+
+import re
+from typing import List, Dict
+
+import requests
+from bs4 import BeautifulSoup, Tag
+import pandas as pd
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+BASE_SITE = "https://www.srg.cosmos.ru"
+PUBLICATIONS_URL = f"{BASE_SITE}/publications/"
+TELEGRAMS_ATEL_URL = f"{BASE_SITE}/publications/telegrams/atel"
+TELEGRAMS_GCN_URL = f"{BASE_SITE}/publications/telegrams/gcn"
+
+
+def clear_arxiv_link(arxiv_abs_link: str | None) -> str | None:
+    """Normalise an arXiv *abs* URL into canonical `<id>` form.
+
+    Examples
+    --------
+    >>> clear_arxiv_link("https://arxiv.org/abs/2301.01234v2")
+    '2301.01234'
+    >>> clear_arxiv_link("arXiv:2209.00001v1")
+    '2209.00001'
+    """
+    if not arxiv_abs_link:
+        return None
+    # remove version suffix like v2
+    cleaned = re.sub(r"v\d+$", "", arxiv_abs_link.strip())
+    # remove protocol and prefix
+    cleaned = re.sub(r"https?://arxiv\.org/abs/", "", cleaned)
+    cleaned = cleaned.replace("arXiv:", "")
+    return cleaned
+
+
+# ----------------------------------------------------------------------------
+# 1. Publication list helpers
+# ----------------------------------------------------------------------------
+
+def _session_for(url: str) -> requests.Session:
+    sess = requests.Session()
+    sess.headers.update({
+        "User-Agent": "srgweb, Python package (uskov@cosmos.ru)"
+    })
+    # prime cookies
+    sess.get(url)
+    return sess
+
+
+def parse_srg_paper_links(page_url: str) -> dict[str, str | None]:
+    """Parse an individual SRG paper page and return arXiv + ADS links.
+
+    Parameters
+    ----------
+    page_url : str
+        Full URL of the SRG paper detail page.
+
+    Returns
+    -------
+    dict with keys ``srg_arxiv_url`` and ``srg_bibcode``.
+    Missing links are ``None``.
+    """
+    sess = _session_for(page_url)
+    soup = BeautifulSoup(sess.get(page_url).text, "html.parser")
+
+    paper_links = [a.get("href") for a in soup.select("li a[href]")]
+    arxiv_link = next((l for l in paper_links if "arxiv.org/abs" in l), None)
+    adsabs_link = next((l for l in paper_links if "ui.adsabs.harvard.edu" in l), None)
+
+    return {
+        "srg_arxiv_url": arxiv_link,
+        "srg_bibcode": adsabs_link,
+    }
+
+
+def get_srg_publications(progress: bool = True) -> pd.DataFrame:
+    """Scrape the main publications page and return a DataFrame.
+
+    Columns
+    -------
+    title_srg : publication title shown on the SRG site.
+    page_srg : full SRG page URL.
+    srg_arxiv : canonical arXiv ID (if any).
+    srg_bibcode : ADS bibcode (if any).
+    srg_arxiv_url : original arXiv URL.
+    """
+    sess = _session_for(PUBLICATIONS_URL)
+
+    soup = BeautifulSoup(sess.get(PUBLICATIONS_URL).text, "html.parser")
+
+    # Remove buttons that interfere with finding <a>
+    for btn in soup.select(".btn"):
+        btn.decompose()
+
+    anchors = soup.select("tbody a")
+
+    titles: List[str] = [a.select_one("strong").text.strip() for a in anchors]
+    page_urls: List[str] = [BASE_SITE + a.get("href") for a in anchors]
+
+    iterator = zip(titles, page_urls)
+
+    records: List[Dict[str, str | None]] = []
+
+    if progress:
+        bar = Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True)
+        task_desc = "Parsing arXiv/ADS links"
+        with bar:
+            t = bar.add_task(task_desc, total=len(titles))
+            for title, link in iterator:
+                links = parse_srg_paper_links(link)
+                rec = {
+                    "title_srg": title,
+                    "page_srg": link,
+                    **links,
+                }
+                rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+                if rec["srg_bibcode"]:
+                    rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
+                records.append(rec)
+                bar.update(t, advance=1)
+    else:
+        for title, link in iterator:
+            links = parse_srg_paper_links(link)
+            rec = {
+                "title_srg": title,
+                "page_srg": link,
+                **links,
+            }
+            rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+            if rec["srg_bibcode"]:
+                rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
+            records.append(rec)
+
+    return pd.DataFrame.from_records(records)
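A short usage sketch for the new module, using only the functions defined above; the expected value in the assert is taken from clear_arxiv_link's own doctest.

from srgweb.publications import get_srg_publications, clear_arxiv_link

# Scrape the publications table; progress=False skips the rich spinner.
df = get_srg_publications(progress=False)
print(df[["title_srg", "srg_arxiv", "srg_bibcode"]].head())

# Normalise an arXiv abs-link to a bare ID.
assert clear_arxiv_link("https://arxiv.org/abs/2301.01234v2") == "2301.01234"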