Indexer: made code more legible, added some comments

Никита Тырин 2024-09-16 10:44:40 +03:00
parent 862efc5cbd
commit c122b271e9


@@ -11,32 +11,39 @@ from datetime import datetime, timedelta
 import gc
 import json
-MAX_WORKERS = 2 * mp.cpu_count() - 1
-CHUNK_SIZE = 1_000
+MAX_WORKERS = 2 * mp.cpu_count() - 1 #max number of workers for multiprocessing
+CHUNK_SIZE = 1_000 #size of individual chunk
-def current_time():
+def current_time(): #a function for timestamps
     return (datetime.now() + timedelta(hours=3)).strftime("%H:%M:%S")
-def update_catalog_file_status(catalog_file, status):
+def update_catalog_file_status(catalog_file, status): #file status updater function
     catalog_file.status = status
     catalog_file.save()
 def healpix():
     interrupted_files = CatalogFile.objects.filter(status='INDEX_IN_PROGRESS')
     total_interrupted = interrupted_files.count()
     print(f'[{current_time()}] Found {total_interrupted} files interrupted mid-index. Cleaning up.')
-    for interrupted_file in interrupted_files:
+    for interrupted_file in interrupted_files: #handling apparent interrupted files
         GaiaSource.objects.filter(catalog_file=interrupted_file).update(
             healpix_ring_index=None,
             healpix_nested_index=None
         )
         update_catalog_file_status(interrupted_file, 'INGESTED')
-    ingested_files = CatalogFile.objects.filter(status='INGESTED')
+    ingested_files = CatalogFile.objects.filter(status='INGESTED') #find files ready for indexing
     total_files = ingested_files.count()
     print(f'[{current_time()}] There are {total_files} ingested files.')
-    current_file_index = 0
     if os.path.exists('config.json'):
         with open('config.json', 'r') as config_file:
@@ -54,7 +61,13 @@ def healpix():
     print(f"[{current_time()}] Selected nside: {nside}")
+    current_file_index = 0
     for catalog_file in ingested_files:
         current_file_index += 1
         print(f'[{current_time()}] Processing sources from {catalog_file.name}')
         sources = list(catalog_file.sources.all())
@@ -70,6 +83,8 @@ def healpix():
             frame='fk5'
         )
         ring_healpix = ah.HEALPix(
             nside=nside,
             order='ring',
@@ -77,6 +92,11 @@ def healpix():
         )
         ring_healpix_indices = ring_healpix.skycoord_to_healpix(skycoord)
+        for source, healpix_index in zip(sources, ring_healpix_indices):
+            source.healpix_ring_index = healpix_index
         nested_healpix = ah.HEALPix(
             nside=nside,
             order='nested',
@@ -84,23 +104,19 @@ def healpix():
         )
         nested_healpix_indices = nested_healpix.skycoord_to_healpix(skycoord)
-        for source, healpix_index in zip(sources, ring_healpix_indices):
-            source.healpix_ring_index = healpix_index
         for source, healpix_index in zip(sources, nested_healpix_indices):
             source.healpix_nested_index = healpix_index
         print(f'[{current_time()}] Calculations done, updating database.')
         update_catalog_file_status(catalog_file, 'INDEX_IN_PROGRESS')
         source_chunks = [sources[i:i + CHUNK_SIZE] for i in range(0, len(sources), CHUNK_SIZE)]
         django.db.connections.close_all()
         with mp.Pool(MAX_WORKERS) as pool:
             pool.map(update_sources, source_chunks, chunksize=10)
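
For context: the update_sources worker that pool.map dispatches to is defined outside these hunks, so its body is not shown in this diff. A minimal sketch of what such a worker could look like, assuming the project's GaiaSource model and Django's bulk_update (the import path myapp.models is hypothetical):

from myapp.models import GaiaSource  # hypothetical import path, not from this diff

def update_sources(source_chunk):
    # write the precomputed HEALPix indices for one chunk in a single query
    GaiaSource.objects.bulk_update(
        source_chunk,
        ['healpix_ring_index', 'healpix_nested_index'],
    )

The django.db.connections.close_all() call just before mp.Pool matters here: forked workers would otherwise inherit the parent's open database socket, and concurrent use of a shared connection corrupts it. Closing first forces each worker to open its own connection lazily.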
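A possible simplification, not part of this commit: ring and nested are two numberings of the same pixels, so the second skycoord_to_healpix pass could be replaced by a direct index conversion. A sketch assuming astropy_healpix's top-level helper and the nside and ring_healpix_indices already computed in the loop above:

from astropy_healpix import ring_to_nested

# converts ring-scheme pixel indices straight to nested-scheme ones,
# skipping a second spherical-coordinate-to-pixel computation
nested_healpix_indices = ring_to_nested(ring_healpix_indices, nside)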