From 47950ea84c0128d6b7cb61f74da47610f67a78e4 Mon Sep 17 00:00:00 2001 From: GeoWill Date: Thu, 13 Jun 2024 12:14:18 +0100 Subject: [PATCH] WiP --- polling_stations/apps/addressbase/models.py | 14 +++- .../apps/data_importers/base_importers.py | 9 +- .../data_importers/data_quality_report.py | 44 ++++------ .../apps/data_importers/data_types.py | 84 ++++++++++++++++--- .../commands/import_aberdeenshire.py | 60 ++----------- 5 files changed, 116 insertions(+), 95 deletions(-) diff --git a/polling_stations/apps/addressbase/models.py b/polling_stations/apps/addressbase/models.py index 9a90736c70..0e73a299a6 100644 --- a/polling_stations/apps/addressbase/models.py +++ b/polling_stations/apps/addressbase/models.py @@ -131,8 +131,18 @@ class Meta: ) -def get_uprn_hash_table(gss_code): - addresses = Address.objects.filter(uprntocouncil__lad=gss_code) +def get_uprn_hash_table(gss_codes: list[str]) -> dict[str, dict[str:str]]: + """ + Takes a list of gss codes and returns a dict with shape: + { + <uprn>: { + "address": <address>
, + "postcode": <postcode>, + "location": <location> + } + } + """ + addresses = Address.objects.filter(uprntocouncil__lad__in=gss_codes) # return result a hash table keyed by UPRN return { a.uprn: { diff --git a/polling_stations/apps/data_importers/base_importers.py b/polling_stations/apps/data_importers/base_importers.py index b921239d32..b097c59bbd 100644 --- a/polling_stations/apps/data_importers/base_importers.py +++ b/polling_stations/apps/data_importers/base_importers.py @@ -175,9 +175,7 @@ def report(self): ) station_report = StationReport(self.council.pk, self.additional_report_councils) district_report = DistrictReport(self.council.pk) - address_report = AddressReport( - self.council.pk, additional_report_councils=self.additional_report_councils - ) + address_report = AddressReport(self.council.pk) report.build_report() @@ -860,7 +858,10 @@ def import_data(self): self.pre_import() self.stations = StationSet() - self.addresses = AddressList(self.logger) + self.addresses = AddressList( + self.logger, extra_councils=self.additional_report_councils + ) + self.import_residential_addresses() self.import_polling_stations() self.addresses.check_records() diff --git a/polling_stations/apps/data_importers/data_quality_report.py b/polling_stations/apps/data_importers/data_quality_report.py index 9c4abeccf4..7c057d5599 100644 --- a/polling_stations/apps/data_importers/data_quality_report.py +++ b/polling_stations/apps/data_importers/data_quality_report.py @@ -1,5 +1,5 @@ from addressbase.models import UprnToCouncil -from councils.models import Council +from councils.models import CouncilGeography from django.db import connection from django.db.models import Q from pollingstations.models import PollingDistrict, PollingStation @@ -229,26 +229,17 @@ def get_districts_containing_more_stations(self): # data quality stats for UPRNs assigned polling station ids class AddressReport: - def __init__(self, council_id, additional_report_councils=None): - if not additional_report_councils: - 
additional_report_councils = [] - self.additional_report_councils = additional_report_councils + def __init__(self, council_id): self.council_id = council_id - self.councils = self.additional_report_councils + [self.council_id] - self.gss_codes = [ - council.geography.gss - for council in Council.objects.filter(pk__in=self.councils).select_related( - "geography" - ) - ] + self.gss_code = CouncilGeography.objects.get(council_id=self.council_id).gss def get_uprns_in_addressbase(self): - return UprnToCouncil.objects.filter(lad__in=self.gss_codes).count() + return UprnToCouncil.objects.filter(lad=self.gss_code).count() def get_addresses_with_station_id(self): return ( UprnToCouncil.objects.filter( - lad__in=self.gss_codes, polling_station_id__isnull=False + lad=self.gss_code, polling_station_id__isnull=False ) .exclude(polling_station_id="") .count() @@ -257,7 +248,7 @@ def get_addresses_with_station_id(self): def get_addresses_without_station_id(self): return UprnToCouncil.objects.filter( Q(polling_station_id__isnull=True) | Q(polling_station_id=""), - council_id__in=self.councils, + council_id=self.council_id, ).count() def get_addresses_with_valid_station_id_ref(self): @@ -267,12 +258,12 @@ def get_addresses_with_valid_station_id_ref(self): SELECT COUNT(*) FROM addressbase_uprntocouncil WHERE polling_station_id IN (SELECT internal_council_id FROM pollingstations_pollingstation - WHERE council_id IN %s) - AND lad IN %s + WHERE council_id = %s) + AND lad = %s AND polling_station_id != '' AND polling_station_id IS NOT NULL; """, - [tuple(self.councils), tuple(self.gss_codes)], + [self.council_id, self.gss_code], ) results = cursor.fetchall() return results[0][0] @@ -284,12 +275,12 @@ def get_addresses_with_invalid_station_id_ref(self): SELECT COUNT(*) FROM addressbase_uprntocouncil WHERE polling_station_id NOT IN (SELECT internal_council_id FROM pollingstations_pollingstation - WHERE council_id IN %s) - AND lad IN %s + WHERE council_id = %s) + AND lad = %s AND 
polling_station_id != '' AND polling_station_id IS NOT NULL; """, - [tuple(self.councils), tuple(self.gss_codes)], + [self.council_id, self.gss_code], ) results = cursor.fetchall() return results[0][0] @@ -474,14 +465,12 @@ def get_csv_coverage_row_color(self, station_ids): return row_color - def build_address_report(self): - table = Table(title="ADDRESSES", show_header=False, min_width=50) + def build_address_report(self, council_id): + table = Table(title=f"{council_id} ADDRESSES", show_header=False, min_width=50) table.add_column("Caption") table.add_column("Number", justify="right") - address_report = AddressReport( - self.council_id, additional_report_councils=self.additional_report_councils - ) + address_report = AddressReport(council_id) uprns_in_council_area = address_report.get_uprns_in_addressbase() addresses_imported = address_report.get_addresses_with_station_id() station_ids = address_report.get_addresses_with_station_id() @@ -538,7 +527,8 @@ def build_report(self): if self.expecting_districts: self.report.add_row(self.build_district_report()) self.report.add_row(self.build_station_report()) - self.report.add_row(self.build_address_report()) + for council_id in [self.council_id] + self.additional_report_councils: + self.report.add_row(self.build_address_report(council_id)) def generate_string_report(self): recorder = Console(record=True) diff --git a/polling_stations/apps/data_importers/data_types.py b/polling_stations/apps/data_importers/data_types.py index 93c1960dd0..5a9d43b2fc 100644 --- a/polling_stations/apps/data_importers/data_types.py +++ b/polling_stations/apps/data_importers/data_types.py @@ -7,7 +7,7 @@ from collections import namedtuple from addressbase.models import Address, UprnToCouncil, get_uprn_hash_table -from councils.models import Council +from councils.models import Council, CouncilGeography from django.db import connection from pollingstations.models import PollingDistrict, PollingStation from uk_geo_utils.helpers import Postcode 
@@ -75,15 +75,9 @@ def council_id(self): # TODO Deal with old_to_new council_ids map def gss_code(self): return Council.objects.get(pk=self.council_id).geography.gss + @abc.abstractmethod def update_uprn_to_council_model(self, polling_station_lookup=None): - if not polling_station_lookup: - polling_station_lookup = self.get_polling_station_lookup() - - uprns_in_council = UprnToCouncil.objects.filter(lad=self.gss_code) - for polling_station_id, uprns in polling_station_lookup.items(): - uprns_in_council.filter(uprn__in=uprns).update( - polling_station_id=polling_station_id - ) + pass class DistrictSet(CustomSet, AssignPollingStationsMixin): @@ -242,9 +236,13 @@ def save(self): class AddressList(AssignPollingStationsMixin): - def __init__(self, logger): + def __init__(self, logger, extra_councils=None): + if extra_councils is None: + extra_councils = [] + self.elements = [] self.logger = logger + self.extra_councils = extra_councils def append(self, address): if ( @@ -263,6 +261,16 @@ def append(self, address): self.elements.append(address) + @property + def council_ids(self) -> list[str]: # TODO Deal with old_to_new council_ids map + return [self.council_id] + self.extra_councils + + @property + def gss_codes(self) -> list[str]: + return CouncilGeography.objects.filter( + council_id__in=self.council_ids + ).values_list("gss", flat=True) + def get_uprn_lookup(self): # for each address, build a lookup of uprn -> set of station ids uprn_lookup = {} @@ -375,11 +383,65 @@ def check_split_postcodes_are_split(self, split_postcodes): pretty=True, ) + def update_uprn_to_council_model(self, polling_station_lookup=None): + if not polling_station_lookup: + polling_station_lookup = self.get_polling_station_lookup() + + uprns_in_council = UprnToCouncil.objects.filter(lad__in=self.gss_codes) + for polling_station_id, uprns in polling_station_lookup.items(): + uprns_assigned_to_station = uprns_in_council.filter(uprn__in=uprns) + + if self.extra_councils: + # At this stage we want 
to know if the station has the right council id. + # There are three cases: + # 1. All the addresses are in the council named in the import script. + # So the station will have the correct council_id and no action is necessary. + # 2. All the addresses are in a different council. + # In this case we need to update the council_id on the polling station in the pollingstations table. + # 3. The addresses assigned to this station are in different council areas. + # In this case we need to duplicate the station making sure there is a record for each council_id. + gss_codes = uprns_assigned_to_station.values_list("lad", flat=True) + council_ids = CouncilGeography.objects.filter( + gss__in=gss_codes + ).values_list("council_id", flat=True) + if len(council_ids) == 1 and council_ids[0] == self.council_id: + # Case 1 + pass + if len(council_ids) == 1 and council_ids[0] != self.council_id: + # Case 2 + station = PollingStation.objects.get( + internal_council_id=polling_station_id, + council_id=self.council_id, + ) + station.council_id = council_ids[0] + station.save() + if len(council_ids) > 1: + # Case 3 + for council_id in council_ids: + try: + PollingStation.objects.get( + council_id=council_id, + internal_council_id=polling_station_id, + ) + except PollingStation.DoesNotExist: + existing_station = PollingStation.objects.get( + council_id=self.council_id, + internal_council_id=polling_station_id, + ) + existing_station.id = None + existing_station.council_id = council_id + existing_station._state.adding = True + existing_station.save() + + uprns_assigned_to_station.filter(uprn__in=uprns).update( + polling_station_id=polling_station_id + ) + def check_records(self): split_postcodes = self.get_council_split_postcodes() self.remove_records_missing_uprns() self.remove_duplicate_uprns() - addressbase_data = get_uprn_hash_table(self.gss_code) + addressbase_data = get_uprn_hash_table(self.gss_codes) self.remove_records_not_in_addressbase(addressbase_data) 
self.remove_records_that_dont_match_addressbase(addressbase_data) self.check_split_postcodes_are_split(split_postcodes) diff --git a/polling_stations/apps/data_importers/management/commands/import_aberdeenshire.py b/polling_stations/apps/data_importers/management/commands/import_aberdeenshire.py index a2f466a78e..0d7c1b7b24 100644 --- a/polling_stations/apps/data_importers/management/commands/import_aberdeenshire.py +++ b/polling_stations/apps/data_importers/management/commands/import_aberdeenshire.py @@ -1,56 +1,14 @@ -from addressbase.models import UprnToCouncil from data_importers.management.commands import BaseHalaroseCsvImporter class Command(BaseHalaroseCsvImporter): council_id = "ABD" - addresses_name = "2022-05-05/2022-04-12T10:11:25.128402/polling_station_export-2022-04-07.edited.csv" - stations_name = "2022-05-05/2022-04-12T10:11:25.128402/polling_station_export-2022-04-07.edited.csv" - elections = ["2022-05-05"] - - def pre_import(self): - # We need to consider rows that don't have a uprn when importing data. - # However there are lots of rows for other councils in this file. 
- # So build a list of stations from rows that do have UPRNS - # and then use that list of stations to make sure we check relevant rows, even if they don't have a UPRN - - council_uprns = set( - UprnToCouncil.objects.filter(lad=self.council.geography.gss).values_list( - "uprn", flat=True - ) - ) - self.COUNCIL_STATIONS = set() - data = self.get_addresses() - - for record in data: - if record.uprn in council_uprns: - self.COUNCIL_STATIONS.add(self.get_station_hash(record)) - - def address_record_to_dict(self, record): - if self.get_station_hash(record) not in self.COUNCIL_STATIONS: - return None - - if record.housepostcode in [ - "AB39 2UJ", - "AB30 1SL", - "AB43 7LN", - "AB42 5JB", - "AB51 8XH", - "AB41 7UA", - "AB51 5DU", - "AB21 0QJ", - "AB35 5PR", - ]: - return None - - return super().address_record_to_dict(record) - - def station_record_to_dict(self, record): - station_hash = self.get_station_hash(record) - if station_hash not in self.COUNCIL_STATIONS: - return None - - if station_hash == "74-hanover-community-centre": - return None - - return super().station_record_to_dict(record) + addresses_name = ( + "2024-07-04/2024-06-07T15:42:14.722645/Eros_SQL_Output002 - Aberdeenshire.csv" + ) + stations_name = ( + "2024-07-04/2024-06-07T15:42:14.722645/Eros_SQL_Output002 - Aberdeenshire.csv" + ) + elections = ["2024-07-04"] + + # additional_report_councils = ["MRY"]