diff --git a/.gitignore b/.gitignore
index 272276e..62b28c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.pyc
 *.pyo
 __pycache__/
-.quarto
\ No newline at end of file
+.quarto
+reference/
+tests/
\ No newline at end of file
diff --git a/examples.py b/examples.py
index 47bc66d..e452032 100644
--- a/examples.py
+++ b/examples.py
@@ -18,3 +18,96 @@ baskets = list_baskets(sess)
 # download basket ART-XC agns
 df_basket = get_basket(sess, basket='ART-XC agns')
+
+from srgweb.artsurvey import (
+    artsurvey_session,
+    get_artsurvey_cat,
+    default_artsurvey_settings,  # needed for the manual walk-through below
+)
+import keyring
+sess = artsurvey_session(
+    "uskov",
+    keyring.get_password("SRG_ARTSURVEY", ""),
+    base_url="http://10.5.2.25/"
+)
+
+base_url = sess.base_url
+settings_url = f"{base_url.rstrip('/')}/artsurvey"
+
+resp = sess.get(settings_url)
+from bs4 import BeautifulSoup, Tag
+soup = BeautifulSoup(resp.text, "html.parser")
+
+form = soup.find("form")
+
+select = form.find("select", {"id": "id_survey"})
+
+survey_options = {
+    option.text.strip(): option.get("value")
+    for option in select.find_all("option")
+    if option.get("value")
+}
+
+# 3. Merge parameters (no user overrides in this walk-through)
+params = {**default_artsurvey_settings()}
+params["survey"] = survey_options['S1-5v12345.12']
+
+# 4. Build the payload (only fields actually present in the form)
+def build_payload(form: Tag, overrides: dict[str, str]) -> dict[str, str]:
+    """
+    Build the payload from **all** form fields, then apply overrides.
+    Behaviour matches rvest::html_form_set() one-to-one.
+    """
+    payload: dict[str, str] = {}
+
+    for el in form.find_all(["input", "select", "textarea"]):
+        name = el.get("name")
+        if not name:
+            continue
+
+        tag_type = el.get("type", "").lower()
+        value: str
+
+        # --- SELECT -------------------------------------------------
+        if el.name == "select":
+            selected = el.find("option", selected=True) or el.find("option")
+            value = selected.get("value", "") if selected else ""
+
+        # --- CHECKBOX / RADIO ---------------------------------------
+        elif tag_type in {"checkbox", "radio"}:
+            # rvest sends "" for unchecked checkboxes
+            if el.has_attr("checked"):
+                value = el.get("value", "on")  # browsers send "on" when value is absent
+            else:
+                value = ""
+
+        # --- TEXT, NUMBER, HIDDEN and other INPUT types --------------
+        else:
+            value = el.get("value", "")
+
+        payload[name] = value
+
+    # User overrides take priority
+    payload.update(overrides)
+    return payload
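+
+# Sanity check on a tiny synthetic form (illustrative only, not part of the
+# real workflow): unchecked boxes map to "", a checked box without a value
+# maps to "on", and overrides win over form defaults.
+_demo = BeautifulSoup(
+    '<form><input name="a" value="1">'
+    '<input type="checkbox" name="b" checked>'
+    '<select name="c"><option value="x" selected>x</option></select></form>',
+    "html.parser",
+)
+assert build_payload(_demo.find("form"), {"a": "2"}) == {"a": "2", "b": "on", "c": "x"}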
+
+payload = build_payload(form, params)
+print(payload)
+
+resp = sess.post(
+    settings_url,
+    data=payload,
+    headers={"Referer": "http://10.5.2.25/login"},  # <- critical
+)
+
+from urllib.parse import urlencode
+print(urlencode(payload, doseq=True))  # debug: inspect the encoded body
+
+resp.raise_for_status()
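+
+# Assumption: the POST above stores the filters server-side, so the catalogue
+# can now be fetched as CSV from the same endpoint get_artsurvey_cat() uses.
+import pandas as pd
+from io import StringIO
+csv_resp = sess.get(f"{base_url.rstrip('/')}/artsurvey/csv/all")
+csv_resp.raise_for_status()
+df = pd.read_csv(StringIO(csv_resp.text), na_values=["", "None"])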
\ No newline at end of file
diff --git a/src/srgweb/artsurvey.py b/src/srgweb/artsurvey.py
index 2b23966..74e8a31 100644
--- a/src/srgweb/artsurvey.py
+++ b/src/srgweb/artsurvey.py
@@ -85,6 +85,7 @@ def artsurvey_session(
     console.print(
         f"[green]✔[/green] Logged in as [cyan]{username}[/cyan] to [link={base_url.rstrip('/')}][blue underline]<{base_url.rstrip('/')}>"
     )
+    session.base_url = base_url  # remember the base URL for later requests
     return session
 
 def default_artsurvey_settings() -> dict[str, str]:
@@ -118,7 +119,7 @@ def default_artsurvey_settings() -> dict[str, str]:
         "class_startswith": "",
         "cname_contains": "",
         "category": "",
-        "exclude_category": "",
+        "exclude_category": ""
         # "category_unclassified": "",
         # "gaia_primary": "",
         # "allwise_primary": "",
@@ -132,10 +133,31 @@
         # "circle_rmax_deg": ""
     }
 
-def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.12", **kwargs):
-    # 2. Fetch the settings form
+def get_artsurvey_cat(
+    session: requests.Session,
+    survey_name: str = "S1-5v12345.12",
+    **kwargs
+) -> pd.DataFrame:
+    """
+    Fetch the ARTSurvey catalogue with filters applied.
+
+    Parameters
+    ----------
+    session : requests.Session
+        Authorised session (created via artsurvey_session()).
+    survey_name : str
+        Survey name exactly as shown in the form, e.g. "S1-5v12345.12".
+    **kwargs : dict
+        Form parameter overrides.
+
+    Returns
+    -------
+    pd.DataFrame
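+
+    Examples
+    --------
+    Filter values are illustrative::
+
+        sess = artsurvey_session("uskov", password, base_url="http://10.5.2.25/")
+        df = get_artsurvey_cat(sess, survey_name="S1-5v12345.12",
+                               class_startswith="AGN")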
+    """
+    base_url = session.base_url
+    # 1. Open the settings page
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Updating survey filters", total=None)
+        progress.add_task(description="Updating ARTSurvey filters", total=None)
         settings_url = f"{base_url.rstrip('/')}/artsurvey"
         resp = session.get(settings_url)
         resp.raise_for_status()
@@ -143,58 +165,55 @@ def get_artsurvey_cat(session: requests.Session, survey_name: str = "S1-5v12345.
     form = soup.find("form")
 
     if not form:
-        console.print("[red]❌ No filter form found on the artsurvey page.[/red]")
+        console.print("[red]❌ Filter form not found.[/red]")
         return pd.DataFrame()
 
-    # 3. Collect form parameters
+    # 2. Get the list of available surveys
+    select = form.find("select", {"id": "id_survey"})
+    if not select:
+        console.print("[red]❌ Survey list (id_survey) not found.[/red]")
+        return pd.DataFrame()
+
+    survey_options = {
+        option.text.strip(): option.get("value")
+        for option in select.find_all("option")
+        if option.get("value")
+    }
+
+    if survey_name not in survey_options:
+        console.print(f"[red]❌ Survey '{survey_name}' not found. Available: {list(survey_options.keys())}[/red]")
+        return pd.DataFrame()
+
+    # 3. Merge parameters
+    params = {**default_artsurvey_settings(), **kwargs}
+    params["survey"] = survey_options[survey_name]
+
+    # 4. Build the payload (only filters actually present in the form)
     payload = {}
+    for tag in form.find_all(["input", "select"]):
+        name = tag.get("name")
+        if name and name in params:
+            payload[name] = params[name]
 
-    # 3.1 input fields
-    for input_tag in form.find_all("input"):
-        name = input_tag.get("name")
-        if not name:
-            continue
-        payload[name] = input_tag.get("value", "")
-
-    # 3.2 select fields
-    for select_tag in form.find_all("select"):
-        name = select_tag.get("name")
-        if not name:
-            continue
-        options = select_tag.find_all("option")
-        selected = None
-        for option in options:
-            if option.text.strip() == survey_name:
-                selected = option.get("value")
-                break
-        if selected:
-            payload[name] = selected
-
-    # 3.3 user arguments (take priority)
-    for k, v in kwargs.items():
-        payload[k] = v
-
-    # 4. Submit the filters
     action = form.get("action") or settings_url
     full_action = action if action.startswith("http") else requests.compat.urljoin(settings_url, action)
+
     resp = session.post(full_action, data=payload, headers={"Referer": settings_url})
     resp.raise_for_status()
 
-    # 5. Load the CSV
+    # 5. Download the CSV
     with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
-        progress.add_task(description="Downloading the ART-Survey catalogue (~10-20 seconds)", total=None)
+        progress.add_task(description="Downloading the ARTSurvey catalogue", total=None)
         csv_url = f"{base_url.rstrip('/')}/artsurvey/csv/all"
         resp = session.get(csv_url)
         resp.raise_for_status()
-        df = pd.read_csv(pd.compat.StringIO(resp.text), na_values=["", "None"])
+        from io import StringIO  # local import; the module-level import is not shown in this diff
+        df = pd.read_csv(StringIO(resp.text), na_values=["", "None"])
 
-    # 6. Post-processing
     if df.empty:
-        console.print("[red]Catalogue is empty. The filters may be too strict.[/red]")
+        console.print("[red]Catalogue is empty. Check the filters.[/red]")
         return df
 
-    if "ra" in df.columns and "dec" in df.columns:
-        coords = SkyCoord(ra=df["ra"].values * u.deg, dec=df["dec"].values * u.deg, frame="icrs")
-        df["skycoord"] = coords
+    if {"ra", "dec"}.issubset(df.columns):
+        df["skycoord"] = SkyCoord(df["ra"].values * u.deg, df["dec"].values * u.deg, frame="icrs")
 
     return df
diff --git a/src/srgweb/publications.py b/src/srgweb/publications.py
new file mode 100644
index 0000000..4fbc238
--- /dev/null
+++ b/src/srgweb/publications.py
@@ -0,0 +1,155 @@
+"""Utility functions to scrape SRG publication & telegram pages.
+
+Dependencies
+------------
+- requests
+- beautifulsoup4
+- pandas
+- rich (optional, for nice progress)
+
+Example
+-------
+>>> from srgweb.publications import (
+...     parse_srg_paper_links,
+...     get_srg_publications,
+...     get_df_from_srg_telegrams,
+... )
+>>> df_papers = get_srg_publications()
+>>> df_tg = get_df_from_srg_telegrams()
+"""
+
+from __future__ import annotations
+
+import re
+from typing import List, Dict
+
+import requests
+from bs4 import BeautifulSoup, Tag
+import pandas as pd
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+BASE_SITE = "https://www.srg.cosmos.ru"
+PUBLICATIONS_URL = f"{BASE_SITE}/publications/"
+TELEGRAMS_ATEL_URL = f"{BASE_SITE}/publications/telegrams/atel"
+TELEGRAMS_GCN_URL = f"{BASE_SITE}/publications/telegrams/gcn"
+
+
+def clear_arxiv_link(arxiv_abs_link: str | None) -> str | None:
+    """Normalise an arXiv *abs* URL into canonical ``YYMM.NNNNN`` ID form.
+
+    Examples
+    --------
+    >>> clear_arxiv_link("https://arxiv.org/abs/2301.01234v2")
+    '2301.01234'
+    >>> clear_arxiv_link("arXiv:2209.00001v1")
+    '2209.00001'
+    """
+    if not arxiv_abs_link:
+        return None
+    # remove a version suffix like v2
+    cleaned = re.sub(r"v\d+$", "", arxiv_abs_link.strip())
+    # remove protocol and prefix
+    cleaned = re.sub(r"https?://arxiv\.org/abs/", "", cleaned)
+    cleaned = cleaned.replace("arXiv:", "")
+    return cleaned
+
+# ----------------------------------------------------------------------------
+# 1. Publication list helpers
+# ----------------------------------------------------------------------------
+
+def _session_for(url: str) -> requests.Session:
+    sess = requests.Session()
+    sess.headers.update({
+        "User-Agent": "srgweb, Python package (uskov@cosmos.ru)"
+    })
+    # prime cookies
+    sess.get(url)
+    return sess
+
+
+def parse_srg_paper_links(page_url: str) -> dict[str, str | None]:
+    """Parse an individual SRG paper page and return arXiv + ADS links.
+
+    Parameters
+    ----------
+    page_url : str
+        Full URL of the SRG paper detail page.
+
+    Returns
+    -------
+    dict with keys ``srg_arxiv_url`` and ``srg_bibcode`` (the full ADS URL
+    at this stage). Missing links are ``None``.
+    """
+    sess = _session_for(page_url)
+    soup = BeautifulSoup(sess.get(page_url).text, "html.parser")
+
+    paper_links = [a.get("href") for a in soup.select("li a[href]")]
+    arxiv_link = next((href for href in paper_links if "arxiv.org/abs" in href), None)
+    adsabs_link = next((href for href in paper_links if "ui.adsabs.harvard.edu" in href), None)
+
+    return {
+        "srg_arxiv_url": arxiv_link,
+        "srg_bibcode": adsabs_link,
+    }
+
+
+def get_srg_publications(progress: bool = True) -> pd.DataFrame:
+    """Scrape the main publications page and return a DataFrame.
+
+    Columns
+    -------
+    title_srg     : Publication title shown on the SRG site.
+    page_srg      : Full SRG page URL.
+    srg_arxiv     : Canonical arXiv ID (if any).
+    srg_bibcode   : ADS bibcode (if any).
+    srg_arxiv_url : Original arXiv URL.
+    """
+    sess = _session_for(PUBLICATIONS_URL)
+
+    soup = BeautifulSoup(sess.get(PUBLICATIONS_URL).text, "html.parser")
+
+    # Remove buttons whose hrefs would otherwise pollute the link extraction
+    for btn in soup.select(".btn"):
+        btn.decompose()
+
+    anchors = soup.select("tbody a")
+
+    titles: List[str] = [a.select_one("strong").text.strip() for a in anchors]
+    page_urls: List[str] = [BASE_SITE + a.get("href") for a in anchors]
+
+    def _record(title: str, link: str) -> Dict[str, str | None]:
+        rec = {"title_srg": title, "page_srg": link, **parse_srg_paper_links(link)}
+        rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+        if rec["srg_bibcode"]:
+            rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
+        return rec
+
+    records: List[Dict[str, str | None]] = []
+
+    if progress:
+        bar = Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True)
+        with bar:
+            t = bar.add_task("Parsing arXiv/ADS links", total=len(titles))
+            for title, link in zip(titles, page_urls):
+                records.append(_record(title, link))
+                bar.update(t, advance=1)
+    else:
+        for title, link in zip(titles, page_urls):
+            records.append(_record(title, link))
+
+    return pd.DataFrame.from_records(records)
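+
+
+if __name__ == "__main__":
+    # Minimal smoke test (assumption: network access to srg.cosmos.ru and an
+    # unchanged site layout); columns follow the docstring above.
+    df = get_srg_publications(progress=False)
+    print(df.head())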