reorganization

This commit is contained in:
Никита Тырин 2024-09-10 14:28:57 +03:00
parent 0f1088a066
commit 3a9ba2f320
8 changed files with 119 additions and 177 deletions

View File

@ -20,7 +20,7 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/ # See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret! # SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-k_$r^3e!9ycqnt0=+ur&sx#hsl44_+v3=al2$_gpnh3u^w!xj$' SECRET_KEY = 'django-insecure-m!o^q^+en&_v%64&m8%d^%_olkzf7$8jbp0^4dph2=1rn=666m'
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True DEBUG = True
@ -31,7 +31,6 @@ ALLOWED_HOSTS = []
# Application definition # Application definition
INSTALLED_APPS = [ INSTALLED_APPS = [
#'rest_framework',
'sample_app', 'sample_app',
'django.contrib.admin', 'django.contrib.admin',
'django.contrib.auth', 'django.contrib.auth',

View File

@ -15,10 +15,8 @@ Including another URLconf
2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
""" """
from django.contrib import admin from django.contrib import admin
from django.urls import path, include from django.urls import path
urlpatterns = [ urlpatterns = [
path('admin/', admin.site.urls), path('admin/', admin.site.urls),
path('api/', include('sample_app.urls')), # Include your app's URLs
] ]

View File

@ -4,40 +4,27 @@ import glob
import uuid import uuid
import asyncio import asyncio
from datetime import datetime, timedelta from datetime import datetime, timedelta
import pandas as pd
import django import django
from asgiref.sync import sync_to_async from asgiref.sync import sync_to_async
from django.core.management.base import BaseCommand
import pandas as pd
# #environment init for django
# current_dir = os.getcwd()
# relative_path = os.path.join(current_dir, 'gaia_orm')
# os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gaia_orm.settings')
# sys.path.append(os.path.normpath(relative_path))
# #sys.path.append('/home/kityr/practiceproject/gaia_orm')
# django.setup()
#import models for both the sources and the files
from sample_app.models import GaiaSource, CatalogFile from sample_app.models import GaiaSource, CatalogFile
class Command(BaseCommand):
help = 'Ingest CSV files into the database'
def handle(self, *args, **options):
#fetching the file list #fetching the file list
directory = input("Please enter the path to the directory containing the csv files (or leave empty if the files are in the same directory as this script): ") directory = input("Please enter the path to the directory containing the csv files (or leave empty if the files are in the same directory as this script): ")
csv_files = glob.glob(os.path.join(directory, '*csv*')) csv_files = glob.glob(os.path.join(directory, '*csv*'))
print(f"Files found: {len(csv_files)}.") self.stdout.write(f"Files found: {len(csv_files)}.")
print("Populating the file database...") self.stdout.write("Populating the file database...")
#initialize the counter #initialize the counter
new_files_count = 0 new_files_count = 0
#add files as catalogfile instances into the database #add files as catalogfile instances into the database
for file_path in csv_files: for file_path in csv_files:
file_name = os.path.basename(file_path) file_name = os.path.basename(file_path)
#use get_or_create to not add files twice #use get_or_create to not add files twice
catalog_file, created = CatalogFile.objects.get_or_create( catalog_file, created = CatalogFile.objects.get_or_create(
@ -47,44 +34,44 @@ for file_path in csv_files:
if created: if created:
new_files_count += 1 new_files_count += 1
#show how many duplicates were already in db #show how many duplicates were already in db
print(f"File database populated. {len(csv_files) - new_files_count} were already in the database.") self.stdout.write(f"File database populated. {len(csv_files) - new_files_count} were already in the database.")
input("Press Enter to continue...") input("Press Enter to continue...")
#bulk creation function #bulk creation function
@sync_to_async @sync_to_async
def bulk_create_gaia_sources(instances): #catalog status update to ingested after ingesting the sources def bulk_create_gaia_sources(instances): #catalog status update to ingested after ingesting the sources
GaiaSource.objects.bulk_create(instances) GaiaSource.objects.bulk_create(instances)
@sync_to_async @sync_to_async
def update_catalog_file_status(catalog_file, status): #catalog file status updater def update_catalog_file_status(catalog_file, status): #catalog file status updater
catalog_file.status = status catalog_file.status = status
catalog_file.save() catalog_file.save()
@sync_to_async @sync_to_async
def get_all_catalog_files(): #catalog file list getter def get_all_catalog_files(): #catalog file list getter
return list(CatalogFile.objects.all()) return list(CatalogFile.objects.all())
@sync_to_async @sync_to_async
def delete_gaiasources_for_catalogfile(catalog_file): #for deleting the sources from partially ingested files in case of improper shutdown def delete_gaiasources_for_catalogfile(catalog_file): #for deleting the sources from partially ingested files in case of improper shutdown
GaiaSource.objects.filter(catalog_file=catalog_file).delete() GaiaSource.objects.filter(catalog_file=catalog_file).delete()
@sync_to_async @sync_to_async
def count_ingested_files(): def count_ingested_files():
return CatalogFile.objects.filter(status='INGESTED').count() return CatalogFile.objects.filter(status='INGESTED').count()
current_time = (datetime.now() + timedelta(hours=3)).strftime("%H:%M:%S") current_time = (datetime.now() + timedelta(hours=3)).strftime("%H:%M:%S")
print(f"[{current_time}] Starting the data ingestion.") self.stdout.write(f"[{current_time}] Starting the data ingestion.")
#function that iterates over all catalog files and ingests sources from them #function that iterates over all catalog files and ingests sources from them
async def ingest_files(): async def ingest_files():
#catalog_files = CatalogFile.objects.all() #catalog_files = CatalogFile.objects.all()
@ -93,17 +80,18 @@ async def ingest_files():
for catalog_file in catalog_files: for catalog_file in catalog_files:
if catalog_file.status == 'INGESTED': if catalog_file.status == 'INGESTED':
print(f"Skipping {catalog_file.name} as it is already ingested.") self.stdout.write(f"Skipping {catalog_file.name} as it is already ingested.")
continue continue
file_path = os.path.join(directory, catalog_file.name) file_path = os.path.join(directory, catalog_file.name)
#print(file_path) #self.stdout.write(file_path)
if os.path.exists(file_path): #check if the file exists at all just in case if os.path.exists(file_path): #check if the file exists at all just in case
pass
if catalog_file.status == 'IN_PROGRESS': if catalog_file.status == 'IN_PROGRESS':
print(f"{catalog_file.name} seems to have been interrupted, starting over.")#if the file status is in_progress, it must've self.stdout.write(f"{catalog_file.name} seems to have been interrupted, starting over.")#if the file status is in_progress, it must've
await delete_gaiasources_for_catalogfile(catalog_file) #been interrupted mid-ingest, delete all the await delete_gaiasources_for_catalogfile(catalog_file) #been interrupted mid-ingest, delete all the
#associated sources if there are any and start over #associated sources if there are any and start over
@ -136,9 +124,9 @@ async def ingest_files():
ingested_files_count = await count_ingested_files() ingested_files_count = await count_ingested_files()
current_time = (datetime.now() + timedelta(hours=3)).strftime("%H:%M:%S") #Timestamp and progress print statement current_time = (datetime.now() + timedelta(hours=3)).strftime("%H:%M:%S") #Timestamp and progress self.stdout.write statement
print(f"[{current_time}] {ingested_files_count}/{len(catalog_files)}") self.stdout.write(f"[{current_time}] {ingested_files_count}/{len(catalog_files)}")
asyncio.run(ingest_files()) asyncio.run(ingest_files())

View File

@ -1,4 +1,4 @@
# Generated by Django 5.1.1 on 2024-09-09 12:01 # Generated by Django 5.1.1 on 2024-09-10 11:15
import django.db.models.deletion import django.db.models.deletion
import uuid import uuid
@ -18,7 +18,7 @@ class Migration(migrations.Migration):
fields=[ fields=[
('uuid', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('name', models.CharField(max_length=32)), ('name', models.CharField(max_length=32)),
('status', models.CharField(choices=[('PENDING', 'Pending'), ('INGESTED', 'Ingested'), ('INDEXED', 'Indexed')], default='PENDING', max_length=10)), ('status', models.CharField(choices=[('PENDING', 'Pending'), ('IN_PROGRESS', 'In Progress'), ('INGESTED', 'Ingested'), ('INDEXED', 'Indexed')], default='PENDING', max_length=11)),
], ],
), ),
migrations.CreateModel( migrations.CreateModel(

View File

@ -1,18 +0,0 @@
# Generated by Django 5.1.1 on 2024-09-09 14:26
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('sample_app', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='catalogfile',
name='status',
field=models.CharField(choices=[('PENDING', 'Pending'), ('IN_PROGRESS', 'In Progress'), ('INGESTED', 'Ingested'), ('INDEXED', 'Indexed')], default='PENDING', max_length=11),
),
]

View File

@ -1,7 +0,0 @@
from rest_framework import serializers
from .models import GaiaSource
class GaiaSourceSerializer(serializers.ModelSerializer):
class Meta:
model = GaiaSource
fields = '__all__'

View File

@ -1,7 +1 @@
from django.urls import path
from .views import GaiaSourceListCreate, GaiaSourceDetail
urlpatterns = [
path('GaiaSource/', GaiaSourceListCreate.as_view(), name='GaiaSource-list-create'),
path('GaiaSource/<int:pk>/', GaiaSourceDetail.as_view(), name='GaiaSource-detail'),
]

View File

@ -1,13 +1 @@
from django.shortcuts import render from django.shortcuts import render
from rest_framework import generics
from .models import GaiaSource
from .serializers import GaiaSourceSerializer
class GaiaSourceListCreate(generics.ListCreateAPIView):
queryset = GaiaSource.objects.all()
serializer_class = GaiaSourceSerializer
class GaiaSourceDetail(generics.RetrieveUpdateDestroyAPIView):
queryset = GaiaSource.objects.all()
serializer_class = GaiaSourceSerializer