Skip to content

Commit

Permalink
WiP
Browse files Browse the repository at this point in the history
  • Loading branch information
GeoWill committed Jun 15, 2024
1 parent 20635f3 commit 6d411fb
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 95 deletions.
14 changes: 12 additions & 2 deletions polling_stations/apps/addressbase/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,18 @@ class Meta:
)


def get_uprn_hash_table(gss_code):
addresses = Address.objects.filter(uprntocouncil__lad=gss_code)
def get_uprn_hash_table(gss_codes: list[str]) -> dict[str, dict[str:str]]:
"""
Takes a list of gss codes and returns a dict with shape:
{
<uprn>: {
"address": <address>,
"postcode": <postcode>,
"location": <location>
}
}
"""
addresses = Address.objects.filter(uprntocouncil__lad__in=gss_codes)
# return the result as a hash table keyed by UPRN
return {
a.uprn: {
Expand Down
9 changes: 5 additions & 4 deletions polling_stations/apps/data_importers/base_importers.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,7 @@ def report(self):
)
station_report = StationReport(self.council.pk, self.additional_report_councils)
district_report = DistrictReport(self.council.pk)
address_report = AddressReport(
self.council.pk, additional_report_councils=self.additional_report_councils
)
address_report = AddressReport(self.council.pk)

report.build_report()

Expand Down Expand Up @@ -860,7 +858,10 @@ def import_data(self):
self.pre_import()

self.stations = StationSet()
self.addresses = AddressList(self.logger)
self.addresses = AddressList(
self.logger, extra_councils=self.additional_report_councils
)

self.import_residential_addresses()
self.import_polling_stations()
self.addresses.check_records()
Expand Down
44 changes: 17 additions & 27 deletions polling_stations/apps/data_importers/data_quality_report.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from addressbase.models import UprnToCouncil
from councils.models import Council
from councils.models import CouncilGeography
from django.db import connection
from django.db.models import Q
from pollingstations.models import PollingDistrict, PollingStation
Expand Down Expand Up @@ -229,26 +229,17 @@ def get_districts_containing_more_stations(self):

# data quality stats for UPRNs assigned polling station ids
class AddressReport:
def __init__(self, council_id, additional_report_councils=None):
if not additional_report_councils:
additional_report_councils = []
self.additional_report_councils = additional_report_councils
def __init__(self, council_id):
self.council_id = council_id
self.councils = self.additional_report_councils + [self.council_id]
self.gss_codes = [
council.geography.gss
for council in Council.objects.filter(pk__in=self.councils).select_related(
"geography"
)
]
self.gss_code = CouncilGeography.objects.get(council_id=self.council_id).gss

def get_uprns_in_addressbase(self):
return UprnToCouncil.objects.filter(lad__in=self.gss_codes).count()
return UprnToCouncil.objects.filter(lad=self.gss_code).count()

def get_addresses_with_station_id(self):
return (
UprnToCouncil.objects.filter(
lad__in=self.gss_codes, polling_station_id__isnull=False
lad=self.gss_code, polling_station_id__isnull=False
)
.exclude(polling_station_id="")
.count()
Expand All @@ -257,7 +248,7 @@ def get_addresses_with_station_id(self):
def get_addresses_without_station_id(self):
return UprnToCouncil.objects.filter(
Q(polling_station_id__isnull=True) | Q(polling_station_id=""),
council_id__in=self.councils,
council_id=self.council_id,
).count()

def get_addresses_with_valid_station_id_ref(self):
Expand All @@ -267,12 +258,12 @@ def get_addresses_with_valid_station_id_ref(self):
SELECT COUNT(*) FROM addressbase_uprntocouncil
WHERE polling_station_id IN
(SELECT internal_council_id FROM pollingstations_pollingstation
WHERE council_id IN %s)
AND lad IN %s
WHERE council_id = %s)
AND lad = %s
AND polling_station_id != ''
AND polling_station_id IS NOT NULL;
""",
[tuple(self.councils), tuple(self.gss_codes)],
[self.council_id, self.gss_code],
)
results = cursor.fetchall()
return results[0][0]
Expand All @@ -284,12 +275,12 @@ def get_addresses_with_invalid_station_id_ref(self):
SELECT COUNT(*) FROM addressbase_uprntocouncil
WHERE polling_station_id NOT IN
(SELECT internal_council_id FROM pollingstations_pollingstation
WHERE council_id IN %s)
AND lad IN %s
WHERE council_id = %s)
AND lad = %s
AND polling_station_id != ''
AND polling_station_id IS NOT NULL;
""",
[tuple(self.councils), tuple(self.gss_codes)],
[self.council_id, self.gss_code],
)
results = cursor.fetchall()
return results[0][0]
Expand Down Expand Up @@ -474,14 +465,12 @@ def get_csv_coverage_row_color(self, station_ids):

return row_color

def build_address_report(self):
table = Table(title="ADDRESSES", show_header=False, min_width=50)
def build_address_report(self, council_id):
table = Table(title=f"{council_id} ADDRESSES", show_header=False, min_width=50)
table.add_column("Caption")
table.add_column("Number", justify="right")

address_report = AddressReport(
self.council_id, additional_report_councils=self.additional_report_councils
)
address_report = AddressReport(council_id)
uprns_in_council_area = address_report.get_uprns_in_addressbase()
addresses_imported = address_report.get_addresses_with_station_id()
station_ids = address_report.get_addresses_with_station_id()
Expand Down Expand Up @@ -538,7 +527,8 @@ def build_report(self):
if self.expecting_districts:
self.report.add_row(self.build_district_report())
self.report.add_row(self.build_station_report())
self.report.add_row(self.build_address_report())
for council_id in [self.council_id] + self.additional_report_councils:
self.report.add_row(self.build_address_report(council_id))

def generate_string_report(self):
recorder = Console(record=True)
Expand Down
91 changes: 80 additions & 11 deletions polling_stations/apps/data_importers/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import namedtuple

from addressbase.models import Address, UprnToCouncil, get_uprn_hash_table
from councils.models import Council
from councils.models import Council, CouncilGeography
from django.db import connection
from pollingstations.models import PollingDistrict, PollingStation
from uk_geo_utils.helpers import Postcode
Expand Down Expand Up @@ -75,15 +75,9 @@ def council_id(self): # TODO Deal with old_to_new council_ids map
def gss_code(self):
return Council.objects.get(pk=self.council_id).geography.gss

@abc.abstractmethod
def update_uprn_to_council_model(self, polling_station_lookup=None):
if not polling_station_lookup:
polling_station_lookup = self.get_polling_station_lookup()

uprns_in_council = UprnToCouncil.objects.filter(lad=self.gss_code)
for polling_station_id, uprns in polling_station_lookup.items():
uprns_in_council.filter(uprn__in=uprns).update(
polling_station_id=polling_station_id
)
pass


class DistrictSet(CustomSet, AssignPollingStationsMixin):
Expand Down Expand Up @@ -242,9 +236,13 @@ def save(self):


class AddressList(AssignPollingStationsMixin):
def __init__(self, logger):
def __init__(self, logger, extra_councils=None):
if extra_councils is None:
extra_councils = []

self.elements = []
self.logger = logger
self.extra_councils = extra_councils

def append(self, address):
if (
Expand All @@ -263,6 +261,16 @@ def append(self, address):

self.elements.append(address)

@property
def council_ids(self) -> list[str]:  # TODO Deal with old_to_new council_ids map
    """Return this importer's council id followed by any extra council ids.

    Reads ``self.council_id`` and ``self.extra_councils``. Unpacking is used
    rather than ``+`` so ``extra_councils`` may be any iterable (for example
    a tuple declared on an import script), not only a list.
    """
    return [self.council_id, *self.extra_councils]

@property
def gss_codes(self) -> list[str]:
    """Return the GSS codes of every council in ``self.council_ids``.

    The codes are read from ``CouncilGeography`` rows whose ``council_id``
    is in ``self.council_ids``. The queryset is materialised into a real
    list so the return value matches the annotation. Each access runs a
    query, and the result order is whatever the database returns.
    """
    return list(
        CouncilGeography.objects.filter(
            council_id__in=self.council_ids
        ).values_list("gss", flat=True)
    )

def get_uprn_lookup(self):
# for each address, build a lookup of uprn -> set of station ids
uprn_lookup = {}
Expand Down Expand Up @@ -375,11 +383,72 @@ def check_split_postcodes_are_split(self, split_postcodes):
pretty=True,
)

def update_uprn_to_council_model(self, polling_station_lookup=None):
    """Write polling station ids onto the matching ``UprnToCouncil`` rows.

    Args:
        polling_station_lookup: mapping of polling station id to a
            collection of UPRNs. When falsy (``None`` or empty) it is
            built with ``self.get_polling_station_lookup()``.

    Only rows whose ``lad`` is in ``self.gss_codes`` are updated. When
    ``self.extra_councils`` is non-empty, each station's council assignment
    is reconciled via ``set_polling_station_for_extra_councils`` before the
    UPRN rows are updated.
    """
    if not polling_station_lookup:
        polling_station_lookup = self.get_polling_station_lookup()

    uprns_in_council = UprnToCouncil.objects.filter(lad__in=self.gss_codes)
    for polling_station_id, uprns in polling_station_lookup.items():
        uprns_assigned_to_station = uprns_in_council.filter(uprn__in=uprns)

        if self.extra_councils:
            self.set_polling_station_for_extra_councils(
                polling_station_id, uprns_assigned_to_station
            )

        # The queryset is already restricted to this station's UPRNs, so no
        # further filtering is needed before the bulk update.
        uprns_assigned_to_station.update(polling_station_id=polling_station_id)

def set_polling_station_for_extra_councils(
    self, polling_station_id, uprns_assigned_to_station
):
    """Make sure the station exists under every council its addresses are in.

    Args:
        polling_station_id: the station's ``internal_council_id``.
        uprns_assigned_to_station: ``UprnToCouncil`` queryset of the
            addresses assigned to this station.

    Raises:
        PollingStation.DoesNotExist: if a station must be moved or copied
            but no station with this id exists for ``self.council_id``.
    """
    # At this stage we want to know if the station has the right council id.
    # There are three cases:
    # 1. All the addresses are in the council named in the import script.
    #    So the station will have the correct council_id and no action is necessary.
    # 2. All the addresses are in a different council.
    #    In this case we need to update the council_id on the polling station
    #    in the pollingstations table.
    # 3. The addresses assigned to this station are in different council areas.
    #    In this case we need to duplicate the station, making sure there is
    #    a record for each council_id.
    gss_codes = uprns_assigned_to_station.values_list("lad", flat=True)
    # Evaluate once and de-duplicate so the case analysis below works on a
    # stable set of distinct council ids.
    council_ids = set(
        CouncilGeography.objects.filter(gss__in=gss_codes).values_list(
            "council_id", flat=True
        )
    )

    if not council_ids or council_ids == {self.council_id}:
        # Case 1 (or no matching addresses at all) - nothing to do
        return

    if len(council_ids) == 1:
        # Case 2 - change council id on station
        (target_council_id,) = council_ids
        station = PollingStation.objects.get(
            internal_council_id=polling_station_id,
            council_id=self.council_id,
        )
        station.council_id = target_council_id
        station.save()
        return

    # Case 3 - create a station for each council that does not have one yet
    for council_id in sorted(council_ids):
        already_present = PollingStation.objects.filter(
            council_id=council_id,
            internal_council_id=polling_station_id,
        ).exists()
        if already_present:
            continue

        # Copy the importer's own station: clearing the primary key and
        # flagging the instance as "adding" makes save() insert a new row.
        station_copy = PollingStation.objects.get(
            council_id=self.council_id,
            internal_council_id=polling_station_id,
        )
        station_copy.id = None
        station_copy.council_id = council_id
        station_copy._state.adding = True
        station_copy.save()

def check_records(self):
split_postcodes = self.get_council_split_postcodes()
self.remove_records_missing_uprns()
self.remove_duplicate_uprns()
addressbase_data = get_uprn_hash_table(self.gss_code)
addressbase_data = get_uprn_hash_table(self.gss_codes)
self.remove_records_not_in_addressbase(addressbase_data)
self.remove_records_that_dont_match_addressbase(addressbase_data)
self.check_split_postcodes_are_split(split_postcodes)
Original file line number Diff line number Diff line change
@@ -1,56 +1,14 @@
from addressbase.models import UprnToCouncil
from data_importers.management.commands import BaseHalaroseCsvImporter


class Command(BaseHalaroseCsvImporter):
council_id = "ABD"
addresses_name = "2022-05-05/2022-04-12T10:11:25.128402/polling_station_export-2022-04-07.edited.csv"
stations_name = "2022-05-05/2022-04-12T10:11:25.128402/polling_station_export-2022-04-07.edited.csv"
elections = ["2022-05-05"]

def pre_import(self):
# We need to consider rows that don't have a uprn when importing data.
# However there are lots of rows for other councils in this file.
# So build a list of stations from rows that do have UPRNS
# and then use that list of stations to make sure we check relevant rows, even if they don't have a UPRN

council_uprns = set(
UprnToCouncil.objects.filter(lad=self.council.geography.gss).values_list(
"uprn", flat=True
)
)
self.COUNCIL_STATIONS = set()
data = self.get_addresses()

for record in data:
if record.uprn in council_uprns:
self.COUNCIL_STATIONS.add(self.get_station_hash(record))

def address_record_to_dict(self, record):
if self.get_station_hash(record) not in self.COUNCIL_STATIONS:
return None

if record.housepostcode in [
"AB39 2UJ",
"AB30 1SL",
"AB43 7LN",
"AB42 5JB",
"AB51 8XH",
"AB41 7UA",
"AB51 5DU",
"AB21 0QJ",
"AB35 5PR",
]:
return None

return super().address_record_to_dict(record)

def station_record_to_dict(self, record):
station_hash = self.get_station_hash(record)
if station_hash not in self.COUNCIL_STATIONS:
return None

if station_hash == "74-hanover-community-centre":
return None

return super().station_record_to_dict(record)
addresses_name = (
"2024-07-04/2024-06-07T15:42:14.722645/Eros_SQL_Output002 - Aberdeenshire.csv"
)
stations_name = (
"2024-07-04/2024-06-07T15:42:14.722645/Eros_SQL_Output002 - Aberdeenshire.csv"
)
elections = ["2024-07-04"]

additional_report_councils = ["MRY"]

0 comments on commit 6d411fb

Please sign in to comment.