initial commit

This commit is contained in:
Никита Тырин 2024-09-10 12:28:01 +03:00
parent 92a8174bdf
commit db126af85a
19 changed files with 655 additions and 0 deletions

129
create_model.ipynb Normal file
View File

@ -0,0 +1,129 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Automatically create the django model using pandas for type inference"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from django.db import models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"read the csv file"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"GaiaSource_000000-003111.csv\", comment='#')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"map pandas types to django fields"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"dtype_mapping = {\n",
" 'int64' : 'models.IntegerField(null=True)',\n",
" 'float64' : 'models.FloatField(null=True)',\n",
" 'bool' : 'models.BooleanField(null=True)',\n",
" 'object' : 'models.CharField(max_length=255, null=True)'\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"generate the model code line by line"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def generate_model(df, model_name):\n",
"\n",
" class_code = f\"class {model_name}(models.Model):\\n\"\n",
"\n",
" for column in df.columns:\n",
" dtype = str(df[column].dtype)\n",
" field_type = dtype_mapping.get(dtype, 'models.CharField(max_length=255, null=True)')\n",
" class_code += f\" {column} = {field_type}\\n\"\n",
" return class_code\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_code = generate_model(df, 'samplemodel')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"save the model code to the models.py file to be used"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"with open('gaia_orm/sample_app/models.py', 'a') as file:\n",
" file.write(model_code)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0
gaia_orm/__init__.py Normal file
View File

16
gaia_orm/asgi.py Normal file
View File

@ -0,0 +1,16 @@
"""
ASGI config for gaia_orm project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gaia_orm.settings')
application = get_asgi_application()

125
gaia_orm/settings.py Normal file
View File

@ -0,0 +1,125 @@
"""
Django settings for gaia_orm project.
Generated by 'django-admin startproject' using Django 5.1.1.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-k_$r^3e!9ycqnt0=+ur&sx#hsl44_+v3=al2$_gpnh3u^w!xj$'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = []
# Application definition
INSTALLED_APPS = [
#'rest_framework',
'sample_app',
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'gaia_orm.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'gaia_orm.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'
# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

24
gaia_orm/urls.py Normal file
View File

@ -0,0 +1,24 @@
"""
URL configuration for gaia_orm project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/5.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
urlpatterns = [
path('admin/', admin.site.urls),
path('api/', include('sample_app.urls')), # Include your app's URLs
]

16
gaia_orm/wsgi.py Normal file
View File

@ -0,0 +1,16 @@
"""
WSGI config for gaia_orm project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gaia_orm.settings')
application = get_wsgi_application()

144
ingester.py Normal file
View File

@ -0,0 +1,144 @@
import os
import sys
import glob
import uuid
import asyncio
from datetime import datetime, timedelta
import django
from asgiref.sync import sync_to_async
import pandas as pd
# Environment initialisation for Django: put the gaia_orm directory on the
# import path and select the settings module, then call django.setup().
current_dir = os.getcwd()
relative_path = os.path.join(current_dir, 'gaia_orm')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gaia_orm.settings')
# The directory is resolved against the current working directory, so the
# result depends on where the script is started from.
sys.path.append(os.path.normpath(relative_path))
django.setup()
# Import the models for both the sources and the catalog files; done only
# after django.setup() has run.
from sample_app.models import GaiaSource, CatalogFile
# Ask for the data directory and collect every path whose name contains "csv".
directory = input("Please enter the path to the directory containing the csv files (or leave empty if the files are in the same directory as this script): ")
csv_files = glob.glob(os.path.join(directory, '*csv*'))
print(f"Files found: {len(csv_files)}.")
print("Populating the file database...")

# Register each file as a CatalogFile; get_or_create keeps a file name that is
# already stored from being inserted a second time.
new_files_count = 0
for file_path in csv_files:
    _, created = CatalogFile.objects.get_or_create(
        name=os.path.basename(file_path),
        defaults={'uuid': uuid.uuid4(), 'status': 'PENDING'},
    )
    # created is a bool, so it adds 1 only for newly inserted rows.
    new_files_count += created

# Report how many of the found files were already present in the database.
print(f"File database populated. {len(csv_files) - new_files_count} were already in the database.")
input("Press Enter to continue...")
# Bulk creation helper, wrapped so it can be awaited from async code.
@sync_to_async
def bulk_create_gaia_sources(instances, batch_size=None):
    """Insert the given GaiaSource instances with ``bulk_create``.

    Args:
        instances: GaiaSource objects to insert.
        batch_size: Optional maximum number of objects per insert query,
            passed straight to ``bulk_create``. ``None`` (the default) leaves
            the batching to Django, which is the previous behaviour.
    """
    GaiaSource.objects.bulk_create(instances, batch_size=batch_size)
@sync_to_async
def update_catalog_file_status(catalog_file, status):
    """Set the status of a stored catalog file and persist it.

    Only the ``status`` column is written, so the other columns of the row
    are not overwritten with whatever values the in-memory instance holds.

    Args:
        catalog_file: CatalogFile instance that already exists in the database.
        status: New status value, e.g. 'IN_PROGRESS' or 'INGESTED'.
    """
    catalog_file.status = status
    catalog_file.save(update_fields=['status'])
@sync_to_async
def get_all_catalog_files():
    """Return all CatalogFile rows as a list."""
    # list() evaluates the queryset here, inside the sync wrapper.
    every_file = CatalogFile.objects.all()
    return list(every_file)
@sync_to_async
def delete_gaiasources_for_catalogfile(catalog_file):
    """Delete every GaiaSource row that points at ``catalog_file``.

    Used to discard the sources of a partially ingested file, e.g. after an
    improper shutdown.
    """
    leftover_sources = GaiaSource.objects.filter(catalog_file=catalog_file)
    leftover_sources.delete()
@sync_to_async
def count_ingested_files():
    """Return the number of CatalogFile rows whose status is 'INGESTED'."""
    ingested = CatalogFile.objects.filter(status='INGESTED')
    return ingested.count()
# Wall-clock stamp for the start message: the local clock shifted by +3 hours.
current_time = f"{datetime.now() + timedelta(hours=3):%H:%M:%S}"
print(f"[{current_time}] Starting the data ingestion.")
# Iterates over all catalog files and ingests the sources found in them.
async def ingest_files():
    """Ingest the sources of every catalog file that is not ingested yet.

    For each stored CatalogFile:
      * files already marked 'INGESTED' are skipped;
      * files that cannot be found in ``directory`` are reported and skipped;
      * files left 'IN_PROGRESS' have their existing sources deleted first;
      * the csv is read, reduced to the columns that match GaiaSource field
        names, bulk-inserted, and the file is marked 'INGESTED'.

    A timestamped progress line is printed after each ingested file.
    """
    catalog_files = await get_all_catalog_files()

    # The model's field names are the same for every file, so look them up
    # once. New model fields are picked up here automatically.
    gaiasource_fields = [field.name for field in GaiaSource._meta.get_fields()]

    for catalog_file in catalog_files:
        if catalog_file.status == 'INGESTED':
            print(f"Skipping {catalog_file.name} as it is already ingested.")
            continue

        file_path = os.path.join(directory, catalog_file.name)
        if not os.path.exists(file_path):
            # The row exists in the database but the file is not in the
            # directory given for this run; say so instead of passing silently.
            print(f"Skipping {catalog_file.name} as the file was not found at {file_path}.")
            continue

        if catalog_file.status == 'IN_PROGRESS':
            # An IN_PROGRESS status means an earlier run stopped mid-ingest:
            # drop whatever sources were stored for the file and start over.
            print(f"{catalog_file.name} seems to have been interrupted, starting over.")
            await delete_gaiasources_for_catalogfile(catalog_file)

        # header=1000 with the pyarrow engine replaced comment='#' for speed.
        # NOTE(review): this assumes the column-name row is always at row
        # index 1000 (after the ECSV metadata lines) - confirm for all inputs.
        df = pd.read_csv(
            file_path,
            header=1000,
            engine="pyarrow"
        )

        # Keep only the columns that have a model field of the same name.
        common_fields = [field for field in gaiasource_fields if field in df.columns]
        data_dict = df[common_fields].to_dict(orient='records')

        # One GaiaSource per row, linked to its catalog file.
        gaia_source_instances = [
            GaiaSource(**data, catalog_file=catalog_file) for data in data_dict
        ]

        await update_catalog_file_status(catalog_file, 'IN_PROGRESS')
        await bulk_create_gaia_sources(gaia_source_instances)
        await update_catalog_file_status(catalog_file, 'INGESTED')

        # Timestamped progress: ingested files out of all known files.
        ingested_files_count = await count_ingested_files()
        current_time = (datetime.now() + timedelta(hours=3)).strftime("%H:%M:%S")
        print(f"[{current_time}] {ingested_files_count}/{len(catalog_files)}")


asyncio.run(ingest_files())

22
manage.py Executable file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Select the project settings and hand the command line to Django."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gaia_orm.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        # Re-raise with a hint about the usual causes, keeping the original
        # error attached as the cause.
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    else:
        execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()

0
sample_app/__init__.py Normal file
View File

3
sample_app/admin.py Normal file
View File

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

6
sample_app/apps.py Normal file
View File

@ -0,0 +1,6 @@
from django.apps import AppConfig
class SampleAppConfig(AppConfig):
    """App configuration for ``sample_app``."""

    name = 'sample_app'
    default_auto_field = 'django.db.models.BigAutoField'

View File

@ -0,0 +1,48 @@
# Generated by Django 5.1.1 on 2024-09-09 12:01
import django.db.models.deletion
import uuid
from django.db import migrations, models
# Initial migration of sample_app: creates the CatalogFile and GaiaSource
# models. Generated by Django; keep the operations as they are.
class Migration(migrations.Migration):

    initial = True

    dependencies = [
    ]

    operations = [
        # Catalog file with a UUID primary key, a name and a status
        # (PENDING / INGESTED / INDEXED at this point).
        migrations.CreateModel(
            name='CatalogFile',
            fields=[
                ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=32)),
                ('status', models.CharField(choices=[('PENDING', 'Pending'), ('INGESTED', 'Ingested'), ('INDEXED', 'Indexed')], default='PENDING', max_length=10)),
            ],
        ),
        # Source row with a UUID primary key; catalog_file is a nullable
        # foreign key to CatalogFile that cascades on delete.
        migrations.CreateModel(
            name='GaiaSource',
            fields=[
                ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('solution_id', models.CharField(blank=True, default='', max_length=19)),
                ('designation', models.CharField(blank=True, default='', max_length=30)),
                ('source_id', models.CharField(blank=True, default='', max_length=19)),
                ('ref_epoch', models.FloatField(default=0.0, null=True)),
                ('ra', models.FloatField(default=0.0, null=True)),
                ('ra_error', models.FloatField(default=0.0, null=True)),
                ('dec', models.FloatField(default=0.0, null=True)),
                ('dec_error', models.FloatField(default=0.0, null=True)),
                ('parallax', models.FloatField(default=0.0, null=True)),
                ('parallax_error', models.FloatField(default=0.0, null=True)),
                ('pmra', models.FloatField(default=0.0, null=True)),
                ('pmra_error', models.FloatField(default=0.0, null=True)),
                ('pmdec', models.FloatField(default=0.0, null=True)),
                ('pmdec_error', models.FloatField(default=0.0, null=True)),
                ('phot_g_mean_mag', models.FloatField(default=0.0, null=True)),
                ('phot_bp_mean_mag', models.FloatField(default=0.0, null=True)),
                ('phot_rp_mean_mag', models.FloatField(default=0.0, null=True)),
                ('catalog_file', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='sources', to='sample_app.catalogfile')),
            ],
        ),
    ]

View File

@ -0,0 +1,18 @@
# Generated by Django 5.1.1 on 2024-09-09 14:26
from django.db import migrations, models
# Follow-up to 0001_initial: adds the IN_PROGRESS choice to CatalogFile.status
# and raises max_length to 11 so the new value fits.
class Migration(migrations.Migration):

    dependencies = [
        ('sample_app', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='catalogfile',
            name='status',
            field=models.CharField(choices=[('PENDING', 'Pending'), ('IN_PROGRESS', 'In Progress'), ('INGESTED', 'Ingested'), ('INDEXED', 'Indexed')], default='PENDING', max_length=11),
        ),
    ]

View File

74
sample_app/models.py Normal file
View File

@ -0,0 +1,74 @@
from django.db import models
import uuid
class CatalogFile(models.Model):
    """A catalog file identified by name, together with its processing status."""

    # Allowed values for ``status`` as (stored value, human-readable label).
    STATUS_CHOICES = [
        ('PENDING', 'Pending'),
        ('IN_PROGRESS', 'In Progress'),
        ('INGESTED', 'Ingested'),
        ('INDEXED', 'Indexed')
    ]

    uuid = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    name = models.CharField(max_length=32)
    status = models.CharField(
        max_length=11,
        choices=STATUS_CHOICES,
        default='PENDING',
    )
class GaiaSource(models.Model):
    # One catalog source row, optionally linked to the CatalogFile it came from.

    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    # Owning catalog file; deleting the file deletes its sources (CASCADE).
    catalog_file = models.ForeignKey(
        CatalogFile, on_delete=models.CASCADE,
        related_name='sources', null=True
    )

    # Solution identifier.
    # NOTE(review): stored as text; why a CharField and not an integer field?
    solution_id = models.CharField(blank=True, default='', max_length=19)

    # Unique source designation across all data releases.
    designation = models.CharField(max_length=30, blank=True, default='')

    # Unique id within the data release.
    # NOTE(review): same question as solution_id - why not an integer field?
    source_id = models.CharField(max_length=19, blank=True, default='')

    # Reference epoch, in Julian years.
    ref_epoch = models.FloatField(default=0.0, null=True)

    # Barycentric right ascension in ICRS at the reference epoch; error in mas.
    ra = models.FloatField(default=0.0, null=True)
    ra_error = models.FloatField(default=0.0, null=True)

    # Barycentric declination in ICRS at the reference epoch; error in mas.
    dec = models.FloatField(default=0.0, null=True)
    dec_error = models.FloatField(default=0.0, null=True)

    # Parallax and its error at the reference epoch, in mas.
    parallax = models.FloatField(default=0.0, null=True)
    parallax_error = models.FloatField(default=0.0, null=True)

    # Proper motion in right ascension, mas/yr.
    pmra = models.FloatField(default=0.0, null=True)
    pmra_error = models.FloatField(default=0.0, null=True)

    # Proper motion in declination, mas/yr.
    pmdec = models.FloatField(default=0.0, null=True)
    pmdec_error = models.FloatField(default=0.0, null=True)

    # Mean G-band magnitude, Vega scale.
    phot_g_mean_mag = models.FloatField(default=0.0, null=True)

    # Mean BP magnitude, Vega scale.
    phot_bp_mean_mag = models.FloatField(default=0.0, null=True)

    # Mean RP magnitude, Vega scale.
    phot_rp_mean_mag = models.FloatField(default=0.0, null=True)

View File

@ -0,0 +1,7 @@
from rest_framework import serializers
from .models import GaiaSource
class GaiaSourceSerializer(serializers.ModelSerializer):
    """Model serializer that exposes all fields of GaiaSource."""

    class Meta:
        fields = '__all__'
        model = GaiaSource

3
sample_app/tests.py Normal file
View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

7
sample_app/urls.py Normal file
View File

@ -0,0 +1,7 @@
from django.urls import path
from .views import GaiaSourceListCreate, GaiaSourceDetail
urlpatterns = [
    # Collection endpoint: list existing sources / create a new one.
    path('GaiaSource/', GaiaSourceListCreate.as_view(), name='GaiaSource-list-create'),
    # Single-source endpoint. GaiaSource uses a UUID primary key, so the
    # route has to capture a UUID; an int converter never matches one.
    path('GaiaSource/<uuid:pk>/', GaiaSourceDetail.as_view(), name='GaiaSource-detail'),
]

13
sample_app/views.py Normal file
View File

@ -0,0 +1,13 @@
from django.shortcuts import render
from rest_framework import generics
from .models import GaiaSource
from .serializers import GaiaSourceSerializer
class GaiaSourceListCreate(generics.ListCreateAPIView):
    """List all GaiaSource rows or create a new one."""

    serializer_class = GaiaSourceSerializer
    queryset = GaiaSource.objects.all()
class GaiaSourceDetail(generics.RetrieveUpdateDestroyAPIView):
    """Retrieve, update or delete a single GaiaSource row."""

    serializer_class = GaiaSourceSerializer
    queryset = GaiaSource.objects.all()