# uplim/management/commands/load_survey.py
# (157 lines, 4.8 KiB, Python)
# uplim/management/commands/load_survey.py
from datetime import datetime
from itertools import islice

import numpy as np
from astropy.io import fits
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.db.models import Max

from uplim.models import Pixel, Survey
# BATCHING HELPER
# **************************************************************
def batch(iterable, size):
    """Yield consecutive lists of at most `size` items from `iterable`.

    The final list may be shorter when the number of items is not an
    exact multiple of `size`; an exhausted iterable yields nothing.
    """
    source = iter(iterable)
    chunk = list(islice(source, size))
    while chunk:
        yield chunk
        chunk = list(islice(source, size))
class Command(BaseCommand):
    """Load a counts/exposure FITS map pair into the ``Pixel`` table.

    The command reads two HEALPix maps of identical shape, then bulk-inserts
    one ``Pixel`` row per map element for the given survey.  It resumes
    automatically from the highest ``hpid`` already stored for that survey,
    so an interrupted run can simply be restarted with the same arguments.
    """

    help = "Process FITS files and store the data in the database"

    # COMMAND LINE ARGUMENTS
    # **************************************************************
    def add_arguments(self, parser):
        parser.add_argument(
            '--counts',
            type=str,
            required=True,
            help='Path of the counts file'
        )
        parser.add_argument(
            '--exposure',
            type=str,
            required=True,
            help='Path of the exposure file'
        )
        parser.add_argument(
            '--survey_number',
            type=int,
            required=True,
            help='Integer ID of the survey being read'
        )
        parser.add_argument(
            '--batch_size',
            type=int,
            default=1000,
            help='Integer number of pixels to be inserted into the database at once'
        )

    def handle(self, *args, **options):
        # GET FILENAMES FROM ARGUMENTS
        # **************************************************************
        counts_file = options['counts']
        exposure_file = options['exposure']
        survey_number = options['survey_number']
        batch_size = options['batch_size']

        self.stdout.write(f"\nCounts file:\t{counts_file}")
        self.stdout.write(f"Exposure file:\t{exposure_file}")

        # OPEN BOTH FILES, RAVEL EACH
        # **************************************************************
        # Both maps are stored in column "T" of the first FITS extension;
        # ravel() flattens whatever nesting the column carries into 1-D.
        with fits.open(counts_file) as hdul:
            counts_data = hdul[1].data["T"].ravel()
        with fits.open(exposure_file) as hdul:
            exposure_data = hdul[1].data["T"].ravel()

        # COMPARE DATA SHAPES, ENSURE THEY'RE THE SAME
        # **************************************************************
        # Raise CommandError instead of using `assert`: assertions are
        # stripped when Python runs with -O, which would silently skip
        # this validation.
        if counts_data.shape != exposure_data.shape:
            raise CommandError(
                "Counts and exposure maps must have the same shape "
                f"({counts_data.shape} vs {exposure_data.shape})"
            )

        self.stdout.write(f"\nCounts Data Shape:\t{counts_data.shape}")
        self.stdout.write(f"Exposure Data Shape:\t{exposure_data.shape}")
        total_pixels = counts_data.shape[0]
        self.stdout.write(f"\nTotal pixels to insert:\t{total_pixels}")

        # CREATE THE SURVEY IF IT DOES NOT EXIST
        # **************************************************************
        with transaction.atomic():
            survey, created = Survey.objects.get_or_create(number=survey_number)
        if created:
            self.stdout.write(f"Created a new survey instance with number: {survey.number}")
        else:
            self.stdout.write(f"Using existing survey instance with the number: {survey.number}")

        # FETCH THE LAST PROCESSED HPID AND CONTINUE FROM IT
        # **************************************************************
        # NOTE: test `is None` explicitly — `max_hpid or -1` would treat a
        # legitimate max hpid of 0 as "no pixels yet" and re-insert pixel 0.
        last_hpid = (
            Pixel.objects
            .filter(survey=survey)
            .aggregate(max_hpid=Max('hpid'))['max_hpid']
        )
        start_index = 0 if last_hpid is None else last_hpid + 1

        # Skip the already-inserted prefix with islice (C-level skipping)
        # instead of filtering every index through a Python comparison.
        remaining = islice(zip(counts_data, exposure_data), start_index, None)
        pixel_generator = (
            Pixel(
                hpid=i,
                counts=int(count),
                exposure=float(exposure),
                survey=survey
            )
            for i, (count, exposure) in enumerate(remaining, start=start_index)
        )

        total_inserted = start_index
        # Process in batches; each batch is committed in its own
        # transaction so an interrupted run loses at most one batch.
        for pixel_batch in batch(pixel_generator, batch_size):
            with transaction.atomic():
                Pixel.objects.bulk_create(pixel_batch)
            total_inserted += len(pixel_batch)
            # Guard the progress report against an empty map (total_pixels
            # == 0 would otherwise divide by zero).
            percentage = (total_inserted / total_pixels * 100) if total_pixels else 100.0
            timestamp = datetime.now().strftime("%H:%M:%S")
            self.stdout.write(
                f"[{timestamp}] {percentage:.2f}% inserted"
            )
        self.stdout.write(f"Inserted a total of {total_inserted} pixels.")