first version

commit 1c66157b68
parent cac9701e3b
.gitignore (vendored) | 4

@@ -1,4 +1,6 @@
 *.pyc
 *.pyo
 __pycache__/
 .quarto
+reference/
+tests/
examples.py | 93

@@ -18,3 +18,96 @@ baskets = list_baskets(sess)
 # download basket ART-XC agns
 df_basket = get_basket(sess, basket='ART-XC agns')

+
+from srgweb.artsurvey import (
+    artsurvey_session,
+    get_artsurvey_cat
+)
+import keyring
+sess = artsurvey_session(
+    "uskov",
+    keyring.get_password("SRG_ARTSURVEY", ""),
+    base_url="http://10.5.2.25/"
+)
+
+base_url = sess.base_url
+settings_url = f"{base_url.rstrip('/')}/artsurvey"
+
+resp = sess.get(settings_url)
+from bs4 import BeautifulSoup
+soup = BeautifulSoup(resp.text, "html.parser")
+
+form = soup.find("form")
+
+select = form.find("select", {"id": "id_survey"})
+
+survey_options = {
+    option.text.strip(): option.get("value")
+    for option in select.find_all("option")
+    if option.get("value")
+}
+
+# 3. Merge the parameters
+params = {**default_artsurvey_settings(), **{}}
+params["survey"] = survey_options['S1-5v12345.12']
+
+# 4. Build the payload (only the filters actually present in the form)
+from bs4 import BeautifulSoup, Tag
+import requests
+
+def build_payload(form: Tag, overrides: dict[str, str]) -> dict[str, str]:
+    """
+    Collect the payload from **all** form fields, then apply overrides.
+    Behaves exactly like rvest::html_form_set().
+    """
+    payload: dict[str, str] = {}
+
+    for el in form.find_all(["input", "select", "textarea"]):
+        name = el.get("name")
+        if not name:
+            continue
+
+        tag_type = el.get("type", "").lower()
+        value: str
+
+        # --- SELECT -------------------------------------------------
+        if el.name == "select":
+            selected = el.find("option", selected=True)
+            value = selected.get("value") if selected else el.find("option").get("value", "")
+
+        # --- CHECKBOX / RADIO --------------------------------------
+        elif tag_type in {"checkbox", "radio"}:
+            # rvest stores "" for unchecked checkboxes
+            if el.has_attr("checked"):
+                value = el.get("value", "on")  # without a value attribute, browsers send "on"
+            else:
+                value = ""
+
+        # --- TEXT, NUMBER, HIDDEN and other INPUTs ------------------
+        else:
+            value = el.get("value", "")
+
+        payload[name] = value
+
+    # Apply the user-supplied overrides
+    payload.update(overrides)
+    return payload
+
+payload = build_payload(form, params)
+payload
+
+
+resp = sess.post(
+    settings_url,
+    data=payload,
+    headers={"Referer": "http://10.5.2.25/login"},  # <- critical
+)
+
+from urllib.parse import urlencode
+print(urlencode(payload, doseq=True))
+
+resp.raise_for_status()
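Note on the example above: build_payload() walks every input, select and textarea in the form, keeps the server-rendered defaults, and only then applies the overrides, so the POST body matches what a browser would submit. A minimal offline sketch of that behaviour; the HTML form and the override values here are invented purely for illustration, and build_payload is assumed to be in scope from the example above:

from bs4 import BeautifulSoup

# Hypothetical form, only to demonstrate build_payload() semantics
html = """
<form>
  <input type="hidden" name="csrfmiddlewaretoken" value="abc123">
  <input type="text" name="cname_contains" value="">
  <input type="checkbox" name="gaia_primary" checked>
  <select name="survey">
    <option value="1">S1-5v12345.12</option>
    <option value="2" selected>S1v5.6</option>
  </select>
</form>
"""
form = BeautifulSoup(html, "html.parser").find("form")

payload = build_payload(form, {"cname_contains": "SRGA"})
# Expected result, given the rules coded above:
#   csrfmiddlewaretoken -> "abc123"  (hidden value kept)
#   cname_contains      -> "SRGA"    (override wins)
#   gaia_primary        -> "on"      (checked box without a value attribute)
#   survey              -> "2"       (value of the <option selected>)
print(payload)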
src/srgweb/artsurvey.py

@@ -85,6 +85,7 @@ def artsurvey_session(
     console.print(
         f"[green]✔[/green] Logged in as [cyan]{username}[/cyan] to [link={base_url.rstrip('/')}][blue underline]<{base_url.rstrip('/')}>"
     )
+    session.base_url = base_url
     return session

 def default_artsurvey_settings() -> dict[str, str]:
@@ -118,7 +119,7 @@ def default_artsurvey_settings() -> dict[str, str]:
         "class_startswith": "",
         "cname_contains": "",
         "category": "",
-        "exclude_category": "",
+        "exclude_category": ""
         # "category_unclassified": "",
         # "gaia_primary": "",
         # "allwise_primary": "",
@@ -132,10 +133,31 @@ def default_artsurvey_settings() -> dict[str, str]:
         # "circle_rmax_deg": ""
     }

-def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.12", **kwargs):
-    # 2. Fetch the settings form
+def get_artsurvey_cat(
+    session: requests.Session,
+    survey_name: str = "S1-5v12345.12",
+    **kwargs
+) -> pd.DataFrame:
+    """
+    Fetch the ARTSurvey catalogue with filters applied.
+
+    Parameters
+    ----------
+    session : requests.Session
+        Authorised session (created via artsurvey_session()).
+    survey_name : str
+        Survey name exactly as shown in the form, e.g. "S1-5v12345.12".
+    **kwargs : dict
+        Overriding values for the form parameters.
+
+    Returns
+    -------
+    pd.DataFrame
+    """
+    base_url = session.base_url
+    # 1. Open the settings page
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Updating survey filters", total=None)
+        progress.add_task(description="Updating ARTSurvey filters", total=None)
         settings_url = f"{base_url.rstrip('/')}/artsurvey"
         resp = session.get(settings_url)
         resp.raise_for_status()
@@ -143,58 +165,55 @@ def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.

     form = soup.find("form")
     if not form:
-        console.print("[red]❌ No filter form found on the artsurvey page.[/red]")
+        console.print("[red]❌ Filter form not found.[/red]")
         return pd.DataFrame()

-    # 3. Collect the form parameters
+    # 2. Fetch the list of available surveys
+    select = form.find("select", {"id": "id_survey"})
+    if not select:
+        console.print("[red]❌ Survey list (id_survey) not found.[/red]")
+        return pd.DataFrame()
+
+    survey_options = {
+        option.text.strip(): option.get("value")
+        for option in select.find_all("option")
+        if option.get("value")
+    }
+
+    if survey_name not in survey_options:
+        console.print(f"[red]❌ Survey '{survey_name}' not found. Available: {list(survey_options.keys())}[/red]")
+        return pd.DataFrame()
+
+    # 3. Merge the parameters
+    params = {**default_artsurvey_settings(), **kwargs}
+    params["survey"] = survey_options[survey_name]
+
+    # 4. Build the payload (only the filters actually present in the form)
     payload = {}
+    for tag in form.find_all(["input", "select"]):
+        name = tag.get("name")
+        if name and name in params:
+            payload[name] = params[name]

-    # 3.1 input fields
-    for input_tag in form.find_all("input"):
-        name = input_tag.get("name")
-        if not name:
-            continue
-        payload[name] = input_tag.get("value", "")
-
-    # 3.2 select fields
-    for select_tag in form.find_all("select"):
-        name = select_tag.get("name")
-        if not name:
-            continue
-        options = select_tag.find_all("option")
-        selected = None
-        for option in options:
-            if option.text.strip() == survey_name:
-                selected = option.get("value")
-                break
-        if selected:
-            payload[name] = selected
-
-    # 3.3 user-supplied arguments (take priority)
-    for k, v in kwargs.items():
-        payload[k] = v
-
-    # 4. Submit the filters
     action = form.get("action") or settings_url
     full_action = action if action.startswith("http") else requests.compat.urljoin(settings_url, action)

     resp = session.post(full_action, data=payload, headers={"Referer": settings_url})
     resp.raise_for_status()

-    # 5. Load the CSV
+    # 5. Download the CSV
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Downloading the ART-Survey catalogue (~10-20 seconds)", total=None)
+        progress.add_task(description="Downloading the ARTSurvey catalogue", total=None)
         csv_url = f"{base_url.rstrip('/')}/artsurvey/csv/all"
         resp = session.get(csv_url)
         resp.raise_for_status()
-        df = pd.read_csv(pd.compat.StringIO(resp.text), na_values=["", "None"])
+        df = pd.read_csv(StringIO(resp.text), na_values=["", "None"])

-    # 6. Post-processing
     if df.empty:
-        console.print("[red]Catalogue is empty. The filters may be too strict.[/red]")
+        console.print("[red]Catalogue is empty. Check the filters.[/red]")
         return df

-    if "ra" in df.columns and "dec" in df.columns:
-        coords = SkyCoord(ra=df["ra"].values * u.deg, dec=df["dec"].values * u.deg, frame="icrs")
-        df["skycoord"] = coords
+    if {"ra", "dec"}.issubset(df.columns):
+        df["skycoord"] = SkyCoord(df["ra"] * u.deg, df["dec"] * u.deg, frame="icrs")

     return df
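For reference, the reworked get_artsurvey_cat() is meant to be driven the same way as the snippet in examples.py: authenticate once, then pass the survey label plus any form-field overrides as keyword arguments. A short sketch, assuming the password was stored with keyring beforehand and using cname_contains only as an illustrative override:

import keyring
from srgweb.artsurvey import artsurvey_session, get_artsurvey_cat

# Assumes the password was saved earlier, e.g. keyring.set_password("SRG_ARTSURVEY", "", "<password>")
sess = artsurvey_session(
    "uskov",
    keyring.get_password("SRG_ARTSURVEY", ""),
    base_url="http://10.5.2.25/",
)

# survey_name must match an <option> label of the id_survey select;
# extra keyword arguments override entries of default_artsurvey_settings().
df = get_artsurvey_cat(sess, survey_name="S1-5v12345.12", cname_contains="SRGA")
print(len(df), "rows")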
src/srgweb/publications.py (new file) | 155

@@ -0,0 +1,155 @@
+"""Utility functions to scrape SRG publication & telegram pages.
+
+Dependencies
+------------
+- requests
+- beautifulsoup4
+- pandas
+- rich (optional, for nice progress)
+
+Example
+-------
+>>> from srg_publications import (
+...     parse_srg_paper_links,
+...     get_df_from_srg_papers,
+...     get_df_from_srg_telegrams,
+... )
+>>> df_papers = get_df_from_srg_papers()
+>>> df_tg = get_df_from_srg_telegrams()
+"""
+
+from __future__ import annotations
+
+import re
+from typing import List, Dict
+
+import requests
+from bs4 import BeautifulSoup, Tag
+import pandas as pd
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+BASE_SITE = "https://www.srg.cosmos.ru"
+PUBLICATIONS_URL = f"{BASE_SITE}/publications/"
+TELEGRAMS_ATEL_URL = f"{BASE_SITE}/publications/telegrams/atel"
+TELEGRAMS_GCN_URL = f"{BASE_SITE}/publications/telegrams/gcn"
+
+
+def clear_arxiv_link(arxiv_abs_link: str | None) -> str | None:
+    """Normalise an arXiv *abs* URL into canonical `<id>` form.
+
+    Examples
+    --------
+    >>> clear_arxiv_link("https://arxiv.org/abs/2301.01234v2")
+    '2301.01234'
+    >>> clear_arxiv_link("arXiv:2209.00001v1")
+    '2209.00001'
+    """
+    if not arxiv_abs_link:
+        return None
+    # remove version suffix like v2
+    cleaned = re.sub(r"v\d+$", "", arxiv_abs_link.strip())
+    # remove protocol and prefix
+    cleaned = re.sub(r"https?://arxiv\.org/abs/", "", cleaned)
+    cleaned = cleaned.replace("arXiv:", "")
+    return cleaned
+
+
+# ----------------------------------------------------------------------------
+# 1. Publication list helpers
+# ----------------------------------------------------------------------------
+
+def _session_for(url: str) -> requests.Session:
+    sess = requests.Session()
+    sess.headers.update({
+        "User-Agent": "srgweb, Python package (uskov@cosmos.ru)"
+    })
+    # prime cookies
+    sess.get(url)
+    return sess
+
+
+def parse_srg_paper_links(page_url: str) -> dict[str, str | None]:
+    """Parse individual SRG paper page and return arXiv + ADS links.
+
+    Parameters
+    ----------
+    page_url : str
+        Full URL to SRG paper detail page.
+
+    Returns
+    -------
+    dict with keys ``srg_arxiv_url`` and ``srg_bibcode``.
+    Missing links are ``None``.
+    """
+    sess = _session_for(page_url)
+    soup = BeautifulSoup(sess.get(page_url).text, "html.parser")
+
+    paper_links = [a.get("href") for a in soup.select("li a[href]")]
+    arxiv_link = next((l for l in paper_links if "arxiv.org/abs" in l), None)
+    adsabs_link = next((l for l in paper_links if "ui.adsabs.harvard.edu" in l), None)
+
+    return {
+        "srg_arxiv_url": arxiv_link,
+        "srg_bibcode": adsabs_link,
+    }
+
+
+def get_srg_publications(progress: bool = True) -> pd.DataFrame:
+    """Scrape the main publications page and return a DataFrame.
+
+    Columns
+    -------
+    title_srg : Publication title shown on SRG site.
+    page_srg : Full SRG page URL.
+    srg_arxiv : Canonical arXiv ID (if any).
+    srg_bibcode : ADS bibcode (if any).
+    srg_arxiv_url : original arXiv URL.
+    """
+    sess = _session_for(PUBLICATIONS_URL)
+
+    soup = BeautifulSoup(sess.get(PUBLICATIONS_URL).text, "html.parser")
+
+    # Remove buttons that interfere with finding <a>
+    for btn in soup.select(".btn"):
+        btn.decompose()
+
+    anchors = soup.select("tbody a")
+
+    titles: List[str] = [a.select_one("strong").text.strip() for a in anchors]
+    page_urls: List[str] = [BASE_SITE + a.get("href") for a in anchors]
+
+    iterator = zip(titles, page_urls)
+
+    records: List[Dict[str, str | None]] = []
+
+    if progress:
+        bar = Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True)
+        task_desc = "Parsing arXiv/ADS links"
+        with bar:
+            t = bar.add_task(task_desc, total=len(titles))
+            for title, link in iterator:
+                links = parse_srg_paper_links(link)
+                rec = {
+                    "title_srg": title,
+                    "page_srg": link,
+                    **links,
+                }
+                rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+                if rec["srg_bibcode"]:
+                    rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
+                records.append(rec)
+                bar.update(t, advance=1)
+    else:
+        for title, link in iterator:
+            links = parse_srg_paper_links(link)
+            rec = {
+                "title_srg": title,
+                "page_srg": link,
+                **links,
+            }
+            rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+            if rec["srg_bibcode"]:
+                rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
+            records.append(rec)
+
+    return pd.DataFrame.from_records(records)
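A brief usage sketch for the functions defined in this new module, assuming the package imports as srgweb.publications (per the file path); the single-paper URL below is illustrative, not a real page:

from srgweb.publications import (
    get_srg_publications,
    parse_srg_paper_links,
    clear_arxiv_link,
)

# Scrape the publication table and resolve arXiv/ADS links for every entry
df = get_srg_publications(progress=True)
print(df[["title_srg", "srg_arxiv", "srg_bibcode"]].head())

# The helpers can also be called on their own
links = parse_srg_paper_links("https://www.srg.cosmos.ru/publications/example-paper/")  # illustrative URL
print(clear_arxiv_link(links["srg_arxiv_url"]))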