# Map pandas dtype names to the Django field declaration emitted for them.
# int64 maps to BigIntegerField because Django's IntegerField is only
# guaranteed to hold 32-bit values, so 64-bit columns would overflow it.
dtype_mapping = {
    'int64': 'models.BigIntegerField(null=True)',
    'float64': 'models.FloatField(null=True)',
    'bool': 'models.BooleanField(null=True)',
    'object': 'models.CharField(max_length=255, null=True)',
}

# Declaration used for any dtype that is missing from dtype_mapping.
_DEFAULT_FIELD = 'models.CharField(max_length=255, null=True)'


def _to_field_name(column):
    """Turn an arbitrary column label into a usable Django field name.

    Runs of non-word characters become a single underscore, double
    underscores are collapsed and leading/trailing underscores are dropped
    (Django rejects field names that contain "__" or end with "_").
    Labels that start with a digit, are Python keywords or equal "pk"
    get a "field_" prefix. Labels that are already plain identifiers are
    returned unchanged.
    """
    # Imported locally so the notebook cell stays self-contained.
    import keyword
    import re

    name = re.sub(r"\W+", "_", str(column).strip())
    name = re.sub(r"_{2,}", "_", name).strip("_")
    if not name:
        return "field"
    if name[0].isdigit() or keyword.iskeyword(name) or name == "pk":
        return f"field_{name}"
    return name


def generate_model(df, model_name):
    """Return the source code of a Django model that mirrors *df*.

    Args:
        df: pandas DataFrame whose columns become model fields.
        model_name: name of the generated model class.

    Returns:
        The class definition as a string ending with a newline. One field
        line is emitted per column, with the field type chosen from
        dtype_mapping (CharField for unknown dtypes). A frame without
        columns produces a class whose body is ``pass`` so the generated
        code is still valid Python.
    """
    lines = [f"class {model_name}(models.Model):"]

    # df.dtypes.items() also works when two columns share a label, where
    # df[column] would return a DataFrame without a .dtype attribute.
    for column, dtype in df.dtypes.items():
        field_type = dtype_mapping.get(str(dtype), _DEFAULT_FIELD)
        lines.append(f"    {_to_field_name(column)} = {field_type}")

    if len(lines) == 1:
        lines.append("    pass")

    return "\n".join(lines) + "\n"
"markdown", + "metadata": {}, + "source": [ + "save the model code to the models.py file to be used" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "with open('gaia_orm/sample_app/models.py', 'a') as file:\n", + " file.write(model_code)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gaia_orm/__init__.py b/gaia_orm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gaia_orm/asgi.py b/gaia_orm/asgi.py new file mode 100644 index 0000000..49ca4b0 --- /dev/null +++ b/gaia_orm/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for gaia_orm project. + +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gaia_orm.settings') + +application = get_asgi_application() diff --git a/gaia_orm/settings.py b/gaia_orm/settings.py new file mode 100644 index 0000000..8387ea1 --- /dev/null +++ b/gaia_orm/settings.py @@ -0,0 +1,125 @@ +""" +Django settings for gaia_orm project. + +Generated by 'django-admin startproject' using Django 5.1.1. + +For more information on this file, see +https://docs.djangoproject.com/en/5.1/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/5.1/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. 
import os

# Repository root: two levels above this settings module.
BASE_DIR = Path(__file__).resolve().parent.parent


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# The committed key is only a development fallback; set DJANGO_SECRET_KEY
# in the environment for any real deployment.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    'django-insecure-k_$r^3e!9ycqnt0=+ur&sx#hsl44_+v3=al2$_gpnh3u^w!xj$',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Stays True unless DJANGO_DEBUG is set to something other than 1/true/yes/on.
DEBUG = os.environ.get('DJANGO_DEBUG', 'True').strip().lower() in ('1', 'true', 'yes', 'on')

# Comma-separated host names from DJANGO_ALLOWED_HOSTS; empty when unset.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
    if host.strip()
]


# Application definition

INSTALLED_APPS = [
    # sample_app's views are built on Django REST framework generics; the
    # app must be installed for its browsable-API templates to be found.
    'rest_framework',
    'sample_app',
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'gaia_orm.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'gaia_orm.wsgi.application'


# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}


# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/

STATIC_URL = 'static/'

# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
import os
import sys
import glob
import gzip
import uuid
import asyncio
from datetime import datetime, timedelta

import django
from asgiref.sync import sync_to_async

import pandas as pd


# Environment init for Django: select the settings module, put the
# gaia_orm directory under the current working directory on sys.path,
# then load the app registry.
current_dir = os.getcwd()
relative_path = os.path.join(current_dir, 'gaia_orm')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gaia_orm.settings')

sys.path.append(os.path.normpath(relative_path))
django.setup()

# Models for both the sources and the files; they can only be imported
# once django.setup() has run.
from sample_app.models import GaiaSource, CatalogFile


# Header row used when it cannot be detected; this is the value the
# script previously hard-coded for every file.
FALLBACK_HEADER_ROW = 1000


def find_header_row(file_path, comment_prefix='#', fallback=FALLBACK_HEADER_ROW):
    """Return the 0-based number of the first line that is not a comment.

    The metadata preamble is skipped by counting the leading lines that
    start with *comment_prefix*, so files whose preamble is not exactly
    1000 lines long are still read with the right header. Files ending in
    ".gz" are read through gzip. *fallback* is returned when the file
    cannot be read or contains nothing but comment lines.
    """
    opener = gzip.open if file_path.lower().endswith('.gz') else open
    try:
        with opener(file_path, 'rt', encoding='utf-8', errors='replace') as handle:
            for row_number, line in enumerate(handle):
                if not line.startswith(comment_prefix):
                    return row_number
    except OSError:
        return fallback
    return fallback


def _timestamp():
    """Return the HH:MM:SS stamp used in progress messages."""
    # NOTE(review): the fixed +3 h shift is kept from the original code; it
    # looks like a manual time-zone correction - confirm.
    return (datetime.now() + timedelta(hours=3)).strftime("%H:%M:%S")


# Fetch the file list.
directory = input("Please enter the path to the directory containing the csv files (or leave empty if the files are in the same directory as this script): ")
csv_files = glob.glob(os.path.join(directory, '*csv*'))
print(f"Files found: {len(csv_files)}.")
print("Populating the file database...")

# Register every file as a CatalogFile row; get_or_create keeps files that
# are already registered from being added twice.
new_files_count = 0
for file_path in csv_files:
    file_name = os.path.basename(file_path)
    catalog_file, created = CatalogFile.objects.get_or_create(
        name=file_name,
        defaults={'uuid': uuid.uuid4(), 'status': 'PENDING'}
    )

    if created:
        new_files_count += 1

# Show how many of the found files were already in the database.
print(f"File database populated. {len(csv_files) - new_files_count} were already in the database.")

input("Press Enter to continue...")


@sync_to_async
def bulk_create_gaia_sources(instances):
    """Insert the given GaiaSource instances with one bulk_create call."""
    GaiaSource.objects.bulk_create(instances)


@sync_to_async
def update_catalog_file_status(catalog_file, status):
    """Set *status* on *catalog_file* and save it."""
    catalog_file.status = status
    catalog_file.save()


@sync_to_async
def get_all_catalog_files():
    """Return every CatalogFile row as a list."""
    return list(CatalogFile.objects.all())


@sync_to_async
def delete_gaiasources_for_catalogfile(catalog_file):
    """Delete all GaiaSource rows that reference *catalog_file*.

    Used to clear the rows of a partially ingested file after an
    improper shutdown.
    """
    GaiaSource.objects.filter(catalog_file=catalog_file).delete()


@sync_to_async
def count_ingested_files():
    """Return the number of CatalogFile rows whose status is 'INGESTED'."""
    return CatalogFile.objects.filter(status='INGESTED').count()


print(f"[{_timestamp()}] Starting the data ingestion.")


async def ingest_files():
    """Ingest the sources of every catalog file that is not yet ingested.

    For each CatalogFile: files already marked 'INGESTED' are skipped;
    files left 'IN_PROGRESS' by an interrupted run have their sources
    deleted and are ingested again. The CSV is read with pandas, reduced
    to the columns that match GaiaSource fields, and bulk-inserted with
    the foreign key set to the file. The file's status goes to
    'IN_PROGRESS' before the insert and to 'INGESTED' after it.
    """
    catalog_files = await get_all_catalog_files()

    # The model's field names do not change between files, so look them up
    # once. catalog_file is passed explicitly below and must not also come
    # from a CSV column.
    gaiasource_fields = [
        field.name
        for field in GaiaSource._meta.get_fields()
        if field.name != 'catalog_file'
    ]

    for catalog_file in catalog_files:

        if catalog_file.status == 'INGESTED':
            print(f"Skipping {catalog_file.name} as it is already ingested.")
            continue

        file_path = os.path.join(directory, catalog_file.name)

        if not os.path.exists(file_path):
            # The row may come from an earlier run on another directory.
            print(f"Skipping {catalog_file.name} as it was not found in the given directory.")
            continue

        if catalog_file.status == 'IN_PROGRESS':
            # The previous ingest was interrupted midway: drop whatever was
            # inserted for this file and start over.
            print(f"{catalog_file.name} seems to have been interrupted, starting over.")
            await delete_gaiasources_for_catalogfile(catalog_file)

        # The pyarrow engine is used for speed; the metadata preamble is
        # skipped by pointing header at the first non-comment line.
        df = pd.read_csv(
            file_path,
            header=find_header_row(file_path),
            engine="pyarrow"
        )

        # Only columns that exist on the model are kept, so new model
        # fields are picked up here without further changes.
        common_fields = [field for field in gaiasource_fields if field in df.columns]

        if not common_fields:
            # Without this guard one empty GaiaSource row would be created
            # for every line of a file whose header was not recognised.
            print(f"Skipping {catalog_file.name}: none of its columns match a GaiaSource field.")
            continue

        data_dict = df[common_fields].to_dict(orient='records')

        # Build the model instances and set the foreign key to the file.
        gaia_source_instances = [
            GaiaSource(**data, catalog_file=catalog_file) for data in data_dict
        ]

        await update_catalog_file_status(catalog_file, 'IN_PROGRESS')

        await bulk_create_gaia_sources(gaia_source_instances)

        await update_catalog_file_status(catalog_file, 'INGESTED')

        ingested_files_count = await count_ingested_files()

        # Timestamp and progress print statement.
        print(f"[{_timestamp()}] {ingested_files_count}/{len(catalog_files)}")


asyncio.run(ingest_files())
class SampleAppConfig(AppConfig):
    """Application configuration for the ``sample_app`` Django app."""

    # Auto field type for primary keys that models in this app do not
    # declare explicitly.
    default_auto_field = 'django.db.models.BigAutoField'
    # Dotted Python path of the application this configuration belongs to.
    name = 'sample_app'
class Migration(migrations.Migration):
    """Alter CatalogFile.status: add the 'IN_PROGRESS' choice and use max_length=11."""

    # Must run after the initial migration that creates CatalogFile.
    dependencies = [
        ('sample_app', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='catalogfile',
            name='status',
            # max_length is 11 so the new 'IN_PROGRESS' value fits.
            field=models.CharField(choices=[('PENDING', 'Pending'), ('IN_PROGRESS', 'In Progress'), ('INGESTED', 'Ingested'), ('INDEXED', 'Indexed')], default='PENDING', max_length=11),
        ),
    ]
class CatalogFile(models.Model):
    """A catalog file identified by name, together with its processing status."""

    # Primary key: a random UUID assigned on creation, not editable.
    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    # File name, limited to 32 characters.
    # NOTE(review): not declared unique - presumably callers rely on the
    # name to identify a file; confirm.
    name = models.CharField(max_length=32)

    # Allowed values for ``status`` as (stored value, human-readable label).
    STATUS_CHOICES = [
        ('PENDING', 'Pending'),
        ('IN_PROGRESS', 'In Progress'),
        ('INGESTED', 'Ingested'),
        ('INDEXED', 'Indexed')
    ]

    # Current status; new rows start as 'PENDING'. max_length=11 fits the
    # longest stored value, 'IN_PROGRESS'.
    status = models.CharField(max_length=11, choices=STATUS_CHOICES, default='PENDING')
# Routes of the GaiaSource REST API.
urlpatterns = [
    # Collection endpoint: list all sources / create a new one.
    path('GaiaSource/', GaiaSourceListCreate.as_view(), name='GaiaSource-list-create'),
    # Detail endpoint: retrieve / update / delete one source. The generic
    # detail view looks the object up through the "pk" URL keyword
    # argument, and GaiaSource's primary key is a UUIDField, so the route
    # must capture it with the uuid converter.
    path('GaiaSource/<uuid:pk>/', GaiaSourceDetail.as_view(), name='GaiaSource-detail'),
]