diff --git a/.gitignore b/.gitignore
index 62b28c2..69e457e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@
 __pycache__/
 .quarto
 reference/
-tests/
\ No newline at end of file
+tests/
+examples.py
\ No newline at end of file
diff --git a/README.html b/README.html
new file mode 100644
index 0000000..27cc954
--- /dev/null
+++ b/README.html
@@ -0,0 +1,545 @@
+[545 lines of rendered HTML omitted; content mirrors README.md]
\ No newline at end of file
diff --git a/README.md b/README.md
index 9001946..148d2ad 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
-# srgweb: Python client for SRG web services
+# srgweb
+
+Python client for SRG web services: triton and publications.
 
 ## Installation
 
 Install the latest version from the repository:
 
-```bash
+``` bash
 pip install git+https://github.com/uskovgs/srgweb/
 ```
 
@@ -18,23 +20,45 @@ from srgweb.triton import (
     list_baskets,
     get_basket
 )
-import keyring
 
 # login to triton
-sess = triton_session("uskov", keyring.get_password("PLAN_SRG", ""))
+session = triton_session("username", "password")
+
 # list available programs
-programs = list_programs(sess)
-# download program SRGA
-df = get_program(sess, program="SRGA")
+programs = list_programs(session)
+
+# download program "SRGA" (case insensitive)
+df = get_program(session, program="srga")
+
 # list available baskets
-baskets = list_baskets(sess)
+baskets = list_baskets(session)
+
 # download basket ART-XC agns
-df_basket = get_basket(sess, basket='ART-XC agns')
+df_basket = get_basket(session, basket='ART-XC agns')
 ```
 
+To avoid entering your password in the terminal, you can store your token securely using the [keyring](https://pypi.org/project/keyring/) package:
+
+``` python
+# ! pip install keyring
+import keyring
+# Save your token (one time)
+keyring.set_password("MY_TOKEN_NAME", "username", "12345")
+
+# Now you can use the keyring to get your password/token in your script
+from srgweb.triton import triton_session
+session = triton_session(
+    username = "username",
+    password = keyring.get_password("MY_TOKEN_NAME", "username")
+)
+```
+
+This way, your password/token is not stored in your scripts or visible in the terminal.
+
+
 ## Working with https://www.srg.cosmos.ru/publications/
 
-```python
+``` python
 from srgweb.publications import get_srg_publications
 
 # Get a list of publications
diff --git a/_quarto.yml b/_quarto.yml
deleted file mode 100644
index 02023aa..0000000
--- a/_quarto.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-quartodoc:
-  style: pkgdown
-  dir: reference
-  package: quartodoc
-  sections:
-    - title: Some functions
-      desc: Functions to inspect docstrings.
-      contents:
-        - get_object
-        - preview
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 700e577..05fa491 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,13 +1,18 @@
 [project]
 name = "srgweb"
 version = "0.1.0"
-description = "python interface to internal web services"
+description = "Python client for SRG web services"
 authors = [
     {name = "uskovgs",email = "uskov@cosmos.ru"}
 ]
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.9"
 dependencies = [
+    "requests",
+    "beautifulsoup4",
+    "pandas",
+    "rich",
+    "pyjanitor"
 ]
diff --git a/reference/get_object.qmd b/reference/get_object.qmd
deleted file mode 100644
index 933ac83..0000000
--- a/reference/get_object.qmd
+++ /dev/null
@@ -1,41 +0,0 @@
-# get_object { #quartodoc.get_object }
-
-```python
-get_object(
-    path,
-    object_name=None,
-    parser='numpy',
-    load_aliases=True,
-    dynamic=False,
-    loader=None,
-)
-```
-
-Fetch a griffe object.
-
-## Parameters {.doc-section .doc-section-parameters}
-
-| Name         | Type            | Description | Default |
-|--------------|-----------------|-------------|---------|
-| path         | str             | An import path to the object. This should have the form `path.to.module:object`. For example, `quartodoc:get_object` or `quartodoc:MdRenderer.render`. | _required_ |
-| object_name  | \'str \| None\' | (Deprecated). A function name. | `None` |
-| parser       | str             | A docstring parser to use. | `'numpy'` |
-| load_aliases |                 | For aliases that were imported from other modules, should we load that module? | `True` |
-| dynamic      |                 | Whether to dynamically import object. Useful if docstring is not hard-coded, but was set on object by running python code. | `False` |
-
-## See Also {.doc-section .doc-section-see-also}
-
-preview: print a user-friendly preview of a griffe object.
-
-## Examples {.doc-section .doc-section-examples}
-
-```python
->>> get_function("quartodoc", "get_function")
->>> from quartodoc import get_object
->>> obj = get_object("quartodoc", "get_object")
-```
-
-```python
->>> preview(obj.docstring.parsed)
-...
-```
-
-```python
->>> preview(obj)
-...
-```
\ No newline at end of file
diff --git a/src/srgweb/publications.py b/src/srgweb/publications.py
index 4fbc238..6dfb150 100644
--- a/src/srgweb/publications.py
+++ b/src/srgweb/publications.py
@@ -1,62 +1,33 @@
-"""Utility functions to scrape SRG publication & telegram pages.
-
-Dependencies
-------------
-- requests
-- beautifulsoup4
-- pandas
-- rich (optional, for nice progress)
-
-Example
--------
->>> from srg_publications import (
-...     parse_srg_paper_links,
-...     get_df_from_srg_papers,
-...     get_df_from_srg_telegrams,
-... )
->>> df_papers = get_df_from_srg_papers()
->>> df_tg = get_df_from_srg_telegrams()
-"""
-
-from __future__ import annotations
-
+from __future__ import annotations  # keeps `str | None` annotations importable on Python 3.9
+
 import re
-from typing import List, Dict
-
 import requests
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup
+
 import pandas as pd
-from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich.console import Console
 
 BASE_SITE = "https://www.srg.cosmos.ru"
 PUBLICATIONS_URL = f"{BASE_SITE}/publications/"
-TELEGRAMS_ATEL_URL = f"{BASE_SITE}/publications/telegrams/atel"
-TELEGRAMS_GCN_URL = f"{BASE_SITE}/publications/telegrams/gcn"
+
+console = Console()
 
 
-def clear_arxiv_link(arxiv_abs_link: str | None) -> str | None:
+def _clear_arxiv_link(arxiv_abs_link: str | None) -> str | None:
     """Normalise an arXiv *abs* URL into canonical `YYMM.NNNNN` form.
 
     Examples
     --------
-    >>> clear_arxiv_link("https://arxiv.org/abs/2301.01234v2")
+    >>> _clear_arxiv_link("https://arxiv.org/abs/2301.01234v2")
     '2301.01234'
-    >>> clear_arxiv_link("arXiv:2209.00001v1")
+    >>> _clear_arxiv_link("arXiv:2209.00001v1")
     '2209.00001'
     """
     if not arxiv_abs_link:
         return None
-    # remove version suffix like v2
     cleaned = re.sub(r"v\d+$", "", arxiv_abs_link.strip())
-    # remove protocol and prefix
     cleaned = re.sub(r"https?://arxiv\.org/abs/", "", cleaned)
     cleaned = cleaned.replace("arXiv:", "")
     return cleaned
 
 
-# ----------------------------------------------------------------------------
-# 1. Publication list helpers
-# ----------------------------------------------------------------------------
-
 def _session_for(url: str) -> requests.Session:
     sess = requests.Session()
     sess.headers.update({
@@ -67,7 +38,7 @@ def _session_for(url: str) -> requests.Session:
     return sess
 
 
-def parse_srg_paper_links(page_url: str) -> dict[str, str | None]:
+def _parse_srg_paper_links(page_url: str) -> dict[str, str | None]:
    """Parse individual SRG paper page and return arXiv + ADS links.
 
     Parameters
@@ -84,16 +55,15 @@ def parse_srg_paper_links(page_url: str) -> dict[str, str | None]:
     soup = BeautifulSoup(sess.get(page_url).text, "html.parser")
 
     paper_links = [a.get("href") for a in soup.select("li a[href]")]
-    arxiv_link = next((l for l in paper_links if "arxiv.org/abs" in l), None)
-    adsabs_link = next((l for l in paper_links if "ui.adsabs.harvard.edu" in l), None)
-
+    arxiv_link = next((link for link in paper_links if link and "arxiv.org/abs" in link), None)
+    adsabs_link = next((link for link in paper_links if link and "ui.adsabs.harvard.edu" in link), None)
     return {
         "srg_arxiv_url": arxiv_link,
         "srg_bibcode": adsabs_link,
     }
 
 
-def get_srg_publications(progress: bool = True) -> pd.DataFrame:
+def get_srg_publications() -> pd.DataFrame:
     """Scrape the main publications page and return a DataFrame.
 
     Columns
     -------
@@ -106,50 +76,32 @@
     """
     sess = _session_for(PUBLICATIONS_URL)
 
-    soup = BeautifulSoup(sess.get(PUBLICATIONS_URL).text, "html.parser")
+    with console.status("Loading SRG publications page", spinner="dots"):
+        soup = BeautifulSoup(sess.get(PUBLICATIONS_URL).text, "html.parser")
 
-    # Remove buttons that interfere with finding
-    for btn in soup.select(".btn"):
-        btn.decompose()
+        # Remove buttons that interfere with finding
+        for btn in soup.select(".btn"):
+            btn.decompose()
 
-    anchors = soup.select("tbody a")
+        anchors = soup.select("tbody a")
 
-    titles: List[str] = [a.select_one("strong").text.strip() for a in anchors]
-    page_urls: List[str] = [BASE_SITE + a.get("href") for a in anchors]
+        titles: list[str] = [a.select_one("strong").text.strip() for a in anchors]
+        page_urls: list[str] = [BASE_SITE + a.get("href") for a in anchors]
 
-    iterator = zip(titles, page_urls)
+        iterator = zip(titles, page_urls)
 
-    records: List[Dict[str, str | None]] = []
-
-    if progress:
-        bar = Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True)
-        task_desc = "Parsing arXiv/ADS links"
-        with bar:
-            t = bar.add_task(task_desc, total=len(titles))
-            for title, link in iterator:
-                links = parse_srg_paper_links(link)
-                rec = {
-                    "title_srg": title,
-                    "page_srg": link,
-                    **links,
-                }
-                rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
-                if rec["srg_bibcode"]:
-                    rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
-                records.append(rec)
-                bar.update(t, advance=1)
-    else:
+        records: list[dict[str, str | None]] = []
         for title, link in iterator:
-            links = parse_srg_paper_links(link)
+            links = _parse_srg_paper_links(link)
             rec = {
                 "title_srg": title,
                 "page_srg": link,
                 **links,
             }
-            rec["srg_arxiv"] = clear_arxiv_link(rec["srg_arxiv_url"])
+            rec["srg_arxiv"] = _clear_arxiv_link(rec["srg_arxiv_url"])
             if rec["srg_bibcode"]:
                 rec["srg_bibcode"] = re.sub(r"https?://ui\.adsabs\.harvard\.edu/abs/", "", rec["srg_bibcode"])
             records.append(rec)
-
+    console.print(f"[green]✔[/green] Loaded {len(records)} publications from SRG site")
     return pd.DataFrame.from_records(records)
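
Review note: the publications module swaps the per-item `rich.progress.Progress` bar for a single `Console.status` spinner plus one summary line. A minimal, self-contained sketch of that pattern, with hypothetical work items standing in for the real paper pages:

``` python
import time

from rich.console import Console

console = Console()

# Hypothetical stand-ins for the scraped paper pages
pages = ["paper-1", "paper-2", "paper-3"]

# One transient spinner around the whole scrape; unlike Progress,
# console.status needs no task total or per-item advance() calls.
with console.status("Parsing arXiv/ADS links", spinner="dots"):
    for page in pages:
        time.sleep(0.1)  # stands in for _parse_srg_paper_links(page)

console.print(f"[green]✔[/green] Loaded {len(pages)} publications from SRG site")
```

The trade-off is less bookkeeping at the cost of per-item feedback, which fits here: the page count is small and the dominant cost is network latency.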
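
For completeness, a usage sketch of the reworked scraper. The column list follows the record keys built in `get_srg_publications` above (their order is an assumption based on dict insertion order), and the filtering step is illustrative, not part of the package:

``` python
from srgweb.publications import get_srg_publications

# Scrape the publications table (a status spinner is shown while it runs)
df = get_srg_publications()

# Record keys above become DataFrame columns
print(df.columns.tolist())
# ['title_srg', 'page_srg', 'srg_arxiv_url', 'srg_bibcode', 'srg_arxiv']

# e.g. keep only entries that resolved to an arXiv id
with_arxiv = df[df["srg_arxiv"].notna()]
print(with_arxiv[["title_srg", "srg_arxiv"]].head())
```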