distribution_fit_utils

   1from csv import QUOTE_ALL
   2import glob
   3import math
   4import os
   5import sys
   6import numpy as np
   7import pandas as pd
   8import json
   9import itertools
  10from fitter import Fitter, get_common_distributions
  11from datetime import timedelta
  12from utils import Utils
  13from des_parallel_process import parallelProcessJoblib, collateRunResults, removeExistingResults
  14from datetime import datetime
  15import visualisation._job_outcome_calculation as _job_outcome_calculation
  16
  17os.chdir(os.path.dirname(os.path.realpath(__file__)))
  18sys.path.append(os.path.dirname(os.path.realpath(__file__)))
  19
  20
class DistributionFitUtils():
    """
        # The DistributionFitUtils class

        This class will import a CSV, undertake some light
        wrangling and then determine the distributions and probabilities required
        for the Discrete Event Simulation.

        Example usage:
            my_data = DistributionFitUtils('data/my_data.csv')
            my_data.import_and_wrangle()

    """
  34
  35    def __init__(self, file_path: str, calculate_school_holidays = False, school_holidays_years = 0):
  36
  37        self.file_path = file_path
  38        self.df = pd.DataFrame()
  39
        # The number of additional years of school holidays
        # that will be calculated beyond the maximum date in the provided dataset
  42        self.school_holidays_years = school_holidays_years
  43        self.calculate_school_holidays = calculate_school_holidays
  44
  45        self.times_to_fit = [
  46            {"hems_result": "Patient Treated but not conveyed by HEMS",
  47            "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  48            {"hems_result": "Patient Conveyed by HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  49            {"hems_result": "Patient Conveyed by land with HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  50            {"hems_result": "Stand Down" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_clear']},
  51            {"hems_result": "Landed but no patient contact" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  52        ]
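        # Times not listed for a given hems_result (e.g. time_on_scene for "Stand Down")
        # have no distribution fitted; the simulation is expected to treat those as 0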
  53
  54        self.sim_tools_distr_plus = [
  55            "poisson",
  56            "bernoulli",
  57            "triang",
  58            "erlang",
  59            "weibull_min",
  60            "expon_weib",
  61            "betabinom",
  62            "pearson3",
  63            "cauchy",
  64            "chi2",
  65            "expon",
  66            "exponpow",
  67            "gamma",
  68            "lognorm",
  69            "norm",
  70            "powerlaw",
  71            "rayleigh",
  72            "uniform",
  73            "neg_binomial",
  74            "zip"
  75        ]
  76        # SR 16-04-2025 Have hardcoded the common distributions
  77        # to make setup for random number generation more robust
  78        #+ get_common_distributions()
  79
    def removeExistingResults(self, folder: str) -> None:
        """
            Removes results from previous fitting
        """

        matching_files = glob.glob(os.path.join(folder, "*.*"))

        print(matching_files)

        for file in matching_files:
            os.remove(file)
  91
  92
    def getBestFit(self, q_times, distr=get_common_distributions(), show_summary=False):
        """

            Convenience function for Fitter.
            Returns the model and parameters that are considered
            the 'best fit'.

            TODO: Determine how Fitter works this out

        """

        if q_times.size > 0:
            if len(distr) > 0:
                f = Fitter(q_times, timeout=60, distributions=distr)
            else:
                f = Fitter(q_times, timeout=60)
            f.fit()
            if show_summary:
                f.summary()
            return f.get_best()
        else:
            return {}
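    # Note: Fitter.get_best() returns a dict keyed by the name of the best-fitting
    # distribution; in recent versions of the fitter package the value is a dict of
    # named parameters (older releases returned a plain tuple), so code reading the
    # saved distribution files should allow for either form.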
 115
 116
    def import_and_wrangle(self):
        """

            Function to import a CSV, add additional columns that are required
            and then sequentially execute other class functions to generate
            the probabilities and distributions required for the DES.

            TODO: Additional logic is required to check the imported CSV
            for missing values, incorrect column names etc.

        """

        try:
            df = pd.read_csv(self.file_path, quoting=QUOTE_ALL)
            self.df = df

            # Perhaps run some kind of checking function here.

        except FileNotFoundError:
            print(f"Cannot locate file {self.file_path}")
            return
 137
 138        # If everything is okay, crack on...
 139        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
        self.df['date_only'] = pd.to_datetime(self.df['inc_date'].dt.date)
 141        self.df['hour'] = self.df['inc_date'].dt.hour                      # Hour of the day
 142        self.df['day_of_week'] = self.df['inc_date'].dt.day_name()         # Day of the week (e.g., Monday)
 143        self.df['month'] = self.df['inc_date'].dt.month
 144        self.df['quarter'] = self.df['inc_date'].dt.quarter
 145        self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]')
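        # Casting to 'datetime64[M]' truncates each timestamp to month precision,
        # i.e. the first day of the incident's month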
 146
        # Replacing an upper quartile limit on job cycle times and
        # instead using a manually specified time frame.
        # This has the advantage of allowing manual amendment of the values
        # on the front-end
 151        # self.max_values_df = self.upper_allowable_time_bounds()
 152        self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv')
 153        #print(self.min_max_values_df)
 154
 155        #This will be needed for other datasets, but has already been computed for DAA
 156        #self.df['ampds_card'] = self.df['ampds_code'].str[:2]
 157
 158        self.removeExistingResults(Utils.HISTORICAL_FOLDER)
 159        self.removeExistingResults(Utils.DISTRIBUTION_FOLDER)
 160
        # Get proportions of AMPDS card by hour of day
        self.hour_by_ampds_card_probs()

        # Determine 'best' distributions for time-based data
        self.activity_time_distributions()

        # Calculate the probability that the patient will be female based on AMPDS card
        self.sex_by_ampds_card_probs()

        # Determine 'best' distributions for age ranges stratified by AMPDS card
        self.age_distributions()

        # Alternative approach to IA times. Start with the probability of a call at a given hour, stratified by quarter
        self.hourly_arrival_by_qtr_probs()

        # Fit distributions (and build weighted empirical samples) for the number of incidents per day, stratified by season and quarter
        self.incidents_per_day()
        self.incidents_per_day_samples()
 179
 180        # Calculate probability of enhanced or critical care being required based on AMPDS card
 181        self.enhanced_or_critical_care_by_ampds_card_probs()
 182
 183        # Calculate HEMS result
 184        self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs()
 185
 186        # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day
 187        # self.callsign_group_by_ampds_card_and_hour_probs()
 188        # self.callsign_group_by_ampds_card_probs()
 189        # self.callsign_group_by_care_category()
 190        self.callsign_group()
 191
        # Calculate probability of a particular vehicle type based on callsign group and quarter of the year
 193        # self.vehicle_type_by_month_probs()
 194        self.vehicle_type_by_quarter_probs()
 195        # self.vehicle_type_probs() # Similar to previous but without monthly stratification since ad hoc unavailability should account for this.
 196
 197        # Calculate the patient outcome (conveyed, deceased, unknown)
 198        self.patient_outcome_by_care_category_and_quarter_probs()
 199
 200        # ============= ARCHIVED CODE ================= #
 201        # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day
 202        # self.inter_arrival_times()
 203        # ============= END ARCHIVED CODE ================= #
 204
 205
 206        # ============= ARCHIVED CODE ================= #
        # Calculate probability of patient outcome
 208        # Note - this still needs to be run to support another one?
 209        # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign
 210        # self.pt_outcome_by_hems_result_and_care_category_probs()
 211        # ============= END ARCHIVED CODE ================= #
 212
 213        # ============= ARCHIVED CODE ================= #
 214        # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs()
 215        # ============= END ARCHIVED CODE ================= #
 216
 217        # ============= ARCHIVED CODE ================= #
        # Calculate probability of HEMS result being allocated to a job based on callsign and hour of day
 219        # self.hems_result_by_callsign_group_and_vehicle_type_probs()
 220        # ============= END ARCHIVED CODE ================= #
 221
 222        # ============= ARCHIVED CODE ================= #
 223        # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit
 224        # self.hems_result_by_care_cat_and_helicopter_benefit_probs()
 225        # ============= END ARCHIVED CODE ================= #
 226
 227
 228        # Calculate school holidays since servicing schedules typically avoid these dates
 229        if self.calculate_school_holidays:
 230            self.school_holidays()
 231
 232        # Calculate historical data
 233        self.historical_monthly_totals()
 234        self.historical_monthly_totals_by_callsign()
 235        self.historical_monthly_totals_by_day_of_week()
 236        self.historical_median_time_of_activities_by_month_and_resource_type()
 237        self.historical_monthly_totals_by_hour_of_day()
 238        self.historical_monthly_resource_utilisation()
 239        self.historical_monthly_totals_all_calls()
 240        self.historical_daily_calls_breakdown()
 241        self.historical_job_durations_breakdown()
 242        self.historical_missed_jobs()
 243        self.historical_jobs_per_day_per_callsign()
 244        self.historical_care_cat_counts()
 245
 246        # Calculate proportions of ad hoc unavailability
 247        try:
 248            # self.ad_hoc_unavailability()
 249            self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31")
 250        except FileNotFoundError:
 251            print("Couldn't find ad-hoc unavailability file")
 252
 253    def hour_by_ampds_card_probs(self):
 254        """
 255
 256            Calculates the proportions of calls that are triaged with
 257            a specific AMPDS card. This is stratified by hour of day
 258
 259            TODO: Determine whether this should also be stratified by yearly quarter
 260
 261        """
 262        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
 263        total_counts = category_counts.groupby('hour')['count'].transform('sum')
 264        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
 265
 266        #category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))
 267
 268        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")
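        # Output has one row per (hour, ampds_card) combination with 'count' and
        # 'proportion' columns; proportions sum to (approximately) 1 within each hour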
 269
 270
 271    def sex_by_ampds_card_probs(self):
 272        """
 273
 274            Calculates the probability that the patient will be female
 275            stratified by AMPDS card.
 276
 277        """
 278        age_df = self.df
 279        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
 280        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
 281        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)
 282
 283        category_counts[category_counts['sex'] =='Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")
 284
 285    def activity_time_distributions(self):
 286        """
 287
 288            Determine the 'best' distribution for each phase of a call
 289            i.e. Allocation time, Mobilisation time, Time to scene
 290            Time on scene, Travel time to hospital and handover, Time to clear.
 291            Not all times will apply to all cases, so the class 'times_to_fit'
 292            variable is a list of dictionaries, which contains the times to fit
 293
            The data is currently stratified by the hems_result and vehicle_type fields.
 295
 296        """
 297
 298        vehicle_type = self.df['vehicle_type'].dropna().unique()
 299
 300        # We'll need to make sure that where a distribution is missing that the time is set to 0 in the model.
 301        # Probably easier than complicated logic to determine what times should be available based on hems_result
 302
 303        final_distr = []
 304
 305        for row in self.times_to_fit:
 306            #print(row)
 307            for ttf in row['times_to_fit']:
 308                for vt in vehicle_type:
 309                    #print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is  {vt}")
 310
 311                    # This line might not be required if data quality is determined when importing the data
 312                    max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0]
 313                    min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0]
 314
 315                    #print(f"Max time is {max_time} and Min time is {min_time}")
 316
 317                    if ttf == 'time_on_scene':
 318                        # There is virtually no data for HEMS_result other than patient conveyed
 319                        # which is causing issues with fitting. For time on scene, will
 320                        # use a simplified fitting ignoring hems_result as a category
 321                        fit_times = self.df[
 322                            (self.df.vehicle_type == vt) &
 323                            (self.df[ttf] >= min_time) &
 324                            (self.df[ttf] <= max_time)
 325                        ][ttf]
 326                    else:
 327                        fit_times = self.df[
 328                            (self.df.vehicle_type == vt) &
 329                            (self.df[ttf] >= min_time) &
 330                            (self.df[ttf] <= max_time) &
 331                            (self.df.hems_result == row['hems_result'])
 332                        ][ttf]
 333                    #print(fit_times[:10])
 334                    best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
 335                    #print(best_fit)
 336
 337                    return_dict = { "vehicle_type": vt, "time_type" : ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)}
 338                    #print(return_dict)
 339                    final_distr.append(return_dict)
 340
        with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file:
            convert_file.write(json.dumps(final_distr))
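        # Each entry in the saved JSON list has the form
        # {"vehicle_type": ..., "time_type": ..., "best_fit": {...}, "hems_result": ..., "n": ...}
        # where "best_fit" is the dictionary returned by getBestFit()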
 344
 345
 346    def age_distributions(self):
 347        """
 348
 349            Determine the 'best' distribution for age stratified by
 350            AMPDS card
 351
 352        """
 353
 354        age_distr = []
 355
 356        age_df = self.df[["age", "ampds_card"]].dropna()
 357        ampds_cards = age_df['ampds_card'].unique()
 358        print(ampds_cards)
 359
 360        for card in ampds_cards:
 361            fit_ages = age_df[age_df['ampds_card'] == card]['age']
 362            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
 363            return_dict = { "ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
 364            age_distr.append(return_dict)
 365
        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
            convert_file.write(json.dumps(age_distr))
 369
 370
 371    # def inter_arrival_times(self):
 372    #     """
 373
 374    #         Calculate the mean inter-arrival times for patients
 375    #         stratified by hour, and and yearly quarter
 376
 377    #     """
 378
 379    #     ia_df = self.df[['date_only', 'quarter', 'hour']].dropna()
 380
 381    #     count_df = ia_df.groupby(['hour', 'date_only', 'quarter']).size().reset_index(name='n')
 382
 383    #     ia_times_df = (
 384    #         count_df.groupby(['hour', 'quarter'])
 385    #         .agg(
 386    #             # max_arrivals_per_hour=('n', lambda x: round(60 / np.max(x), 3)),
 387    #             # min_arrivals_per_hour=('n', lambda x: round(60 / np.min(x),3)),
 388    #             mean_cases=('n', lambda x: round(x.mean(), 1)),
 389    #             # sd_cases=('n', lambda x: round(x.std(), 3)),
 390    #             mean_iat=('n', lambda x: 60 / x.mean())
 391    #             # n=('n', 'size')
 392    #         )
 393    #         .reset_index()
 394    #     )
 395    #     # Additional column for NSPPThinning
 396    #     ia_times_df['t'] = ia_times_df['hour']
 397    #     ia_times_df['arrival_rate'] = ia_times_df['mean_iat'].apply(lambda x: 1/x)
 398
 399    #     ia_times_df.to_csv('distribution_data/inter_arrival_times.csv', mode='w+')
 400
 401
    def incidents_per_day(self):
        """
        Fit distributions for the number of incidents per day using actual daily counts,
        applying year-based weighting to reflect trends (e.g., 2024 busier than 2023),
        stratified by season and quarter.
        """
 411
 412        inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy()
 413        inc_df['year'] = inc_df['date_only'].dt.year
 414
 415        # Daily incident counts
 416        inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 417        inc_per_day['year'] = inc_per_day['date_only'].dt.year
 418
 419        # Merge quarter and season from self.df
 420        date_info = self.df[['date_only', 'quarter']].drop_duplicates()
 421
 422        if 'season' not in self.df.columns:
 423            date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 424        else:
 425            date_info = date_info.merge(
 426                self.df[['date_only', 'season']].drop_duplicates(),
 427                on='date_only',
 428                how='left'
 429            )
 430
 431        inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left')
 432
        # Weight settings - a simple replication-based weighting rather than a weighted-mean approach
        year_weights = {
            2023: 1.0,
            2024: 4.0  # weight 2024 four times more heavily than 2023
        }
 438
 439        # ========== SEASONAL DISTRIBUTIONS ==========
 440        jpd_distr = []
 441
 442        for season in inc_per_day['season'].dropna().unique():
 443            filtered = inc_per_day[inc_per_day['season'] == season].copy()
 444            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 445
 446            # Repeat rows proportionally by weight
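            # (approximates weighted fitting by replication: each day's count is repeated
            # weight * 10 times, e.g. 10 copies for weight 1.0 and 40 for weight 4.0, so
            # more heavily weighted years contribute proportionally more observations)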
 447            replicated = filtered.loc[
 448                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 449            ]['jobs_per_day']
 450
 451            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 452
 453            jpd_distr.append({
 454                "season": season,
 455                "best_fit": best_fit,
 456                "min_n_per_day": int(replicated.min()),
 457                "max_n_per_day": int(replicated.max()),
 458                "mean_n_per_day": float(replicated.mean())
 459            })
 460
 461        with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f:
 462            json.dump(jpd_distr, f)
 463
 464        # ========== QUARTERLY DISTRIBUTIONS ==========
 465        jpd_qtr_distr = []
 466
 467        for quarter in inc_per_day['quarter'].dropna().unique():
 468            filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy()
 469            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 470
 471            replicated = filtered.loc[
 472                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 473            ]['jobs_per_day']
 474
 475            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 476
 477            jpd_qtr_distr.append({
 478                "quarter": int(quarter),
 479                "best_fit": best_fit,
 480                "min_n_per_day": int(replicated.min()),
 481                "max_n_per_day": int(replicated.max()),
 482                "mean_n_per_day": float(replicated.mean())
 483            })
 484
 485        with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f:
 486            json.dump(jpd_qtr_distr, f)
 487
 488    def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
 489        """
 490            Create weighted empirical samples of incidents per day by season and quarter.
 491        """
 492
 493        inc_df = self.df[['date_only', 'quarter']].copy()
 494        inc_df['year'] = inc_df['date_only'].dt.year
 495        inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 496
 497        # Get raw counts per day
 498        daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 499        daily_counts['year'] = daily_counts['date_only'].dt.year
 500
 501        # Merge back in season/quarter info
 502        meta_info = self.df[['date_only', 'quarter']].drop_duplicates()
 503        if 'season' in self.df.columns:
 504            meta_info = meta_info.merge(
 505                self.df[['date_only', 'season']].drop_duplicates(),
 506                on='date_only', how='left'
 507            )
 508        else:
 509            meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 510
 511        daily_counts = daily_counts.merge(meta_info, on='date_only', how='left')
 512
 513        # Year weight map
 514        if weight_map is None:
 515            weight_map = {2023: 1.0, 2024: 1.1}
 516
 517        # Compute weights
 518        daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0)
 519
 520        # Storage
 521        empirical_samples = {}
 522
 523        # Season-based
 524        for season in daily_counts['season'].dropna().unique():
 525            filtered = daily_counts[daily_counts['season'] == season].copy()
 526            repeated = filtered.loc[
 527                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 528            ]['jobs_per_day'].tolist()
 529
 530            empirical_samples[season] = repeated
 531
 532        # Quarter-based
 533        for quarter in daily_counts['quarter'].dropna().unique():
 534            filtered = daily_counts[daily_counts['quarter'] == quarter].copy()
 535            repeated = filtered.loc[
 536                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 537            ]['jobs_per_day'].tolist()
 538
 539            empirical_samples[f"Q{int(quarter)}"] = repeated
 540
 541        with open("distribution_data/inc_per_day_samples.json", 'w') as f:
 542            json.dump(empirical_samples, f)
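        # The resulting JSON maps each season ("winter"/"summer") and quarter ("Q1"-"Q4")
        # to a weighted list of daily job counts, e.g. {"winter": [...], "Q1": [...], ...}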
 543
 544
 545    def enhanced_or_critical_care_by_ampds_card_probs(self):
 546        """
 547
            Calculates the probability of an enhanced or critical care resource being required
 549            based on the AMPDS card
 550
 551        """
 552
 553        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()
 554
 555        def assign_care_category(row):
 556            # There are some columns with both EC and CC benefit selected
 557            # this function will allocate to only 1
 558            if row['cc_benefit'] == 'y':
 559                return 'CC'
 560            elif row['ec_benefit'] == 'y':
 561                return 'EC'
 562            else:
 563                return 'REG'
 564
 565        ec_df['care_category'] = ec_df.apply(assign_care_category, axis = 1)
 566
 567        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
 568        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')
 569
 570        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)
 571
 572        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode = "w+", index = False)
 573
 574    def patient_outcome_by_care_category_and_quarter_probs(self):
 575        """
 576
            Calculates the probability of a patient outcome based on care category and yearly quarter
 578
 579        """
 580
 581        po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy()
 582
 583        def assign_care_category(row):
 584            # There are some columns with both EC and CC benefit selected
 585            # this function will allocate to only 1
 586            if row['cc_benefit'] == 'y':
 587                return 'CC'
 588            elif row['ec_benefit'] == 'y':
 589                return 'EC'
 590            else:
 591                return 'REG'
 592
 593        # There are some values that are missing e.g. CC quarter 1 Deceased
 594        # I think we've had problems when trying to sample from this kind of thing before
 595        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 596        outcomes = po_df['pt_outcome'].unique()
 597        care_categories = ['CC', 'EC', 'REG']
 598        quarters = po_df['quarter'].unique()
 599
 600        all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)),
 601                                    columns=['pt_outcome', 'care_category', 'quarter'])
 602
 603        po_df['care_category'] = po_df.apply(assign_care_category, axis = 1)
 604
 605        po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count')
 606
 607        merged = pd.merge(all_combinations, po_cat_counts,
 608                      on=['pt_outcome', 'care_category', 'quarter'],
 609                      how='left').fillna({'count': 0})
 610        merged['count'] = merged['count'].astype(int)
 611
 612        total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum')
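        # replace(0, 1) guards against division by zero for combinations that never occur,
        # so those rows get a proportion of 0.0 rather than NaN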
 613        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 614
 615        merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode = "w+", index = False)
 616
 617    # def hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 618    #     """
 619
 620    #         Calculates the probabilty of a given HEMS result based on
 621    #         patient outcome, yearly quarter, vehicle type and callsign group
 622
 623    #     """
 624
 625    #     hr_df = self.df[['hems_result', 'quarter', 'pt_outcome', 'vehicle_type', 'callsign_group']].copy()
 626
 627    #     # There are some values that are missing e.g. CC quarter 1 Deceased
 628    #     # I think we've had problems when trying to sample from this kind of thing before
 629    #     # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 630    #     # hems_results = hr_df['hems_result'].unique()
 631    #     # outcomes = hr_df['pt_outcome'].unique()
 632    #     # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 633    #     # callsign_group_categories = hr_df['callsign_group'].unique()
 634    #     # quarters = hr_df['quarter'].unique()
 635
 636    #     # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 637    #     #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 638
 639    #     hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter']).size().reset_index(name='count')
 640
 641    #     # merged = pd.merge(all_combinations, hr_cat_counts,
 642    #     #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 643    #     #               how='left').fillna({'count': 0})
 644    #     # merged['count'] = merged['count'].astype(int)
 645
 646    #     merged = hr_cat_counts
 647
 648    #     total_counts = merged.groupby(['pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])['count'].transform('sum')
 649    #     merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 650
 651    #     merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 652
 653    def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 654        """
 655
            Calculates the probability of a given HEMS result based on
 657                - patient outcome
 658                - yearly quarter
 659                - time of day (7am - 6pm, 7pm - 6am)
 660                - vehicle type
 661                - and callsign group
 662
 663        """
 664        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
 665        self.df['hour'] = self.df['inc_date'].dt.hour
 666        self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night")
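        # 'day' covers hours 7-18 inclusive (07:00-18:59); all other hours are classed as 'night'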
 667
 668        hr_df = self.df[[
 669            'hems_result', 'quarter', 'pt_outcome',
 670            'vehicle_type', 'callsign_group', 'time_of_day'
 671            ]].copy()
 672
 673        # There are some values that are missing e.g. CC quarter 1 Deceased
 674        # I think we've had problems when trying to sample from this kind of thing before
 675        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 676        # hems_results = hr_df['hems_result'].unique()
 677        # outcomes = hr_df['pt_outcome'].unique()
 678        # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 679        # callsign_group_categories = hr_df['callsign_group'].unique()
 680        # quarters = hr_df['quarter'].unique()
 681
 682        # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 683        #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 684
 685        hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome',
 686                                       'vehicle_type', 'callsign_group',
 687                                       'quarter', 'time_of_day']).size().reset_index(name='count')
 688
 689        # merged = pd.merge(all_combinations, hr_cat_counts,
 690        #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 691        #               how='left').fillna({'count': 0})
 692        # merged['count'] = merged['count'].astype(int)
 693
 694        merged = hr_cat_counts
 695
 696        total_counts = merged.groupby(['pt_outcome',
 697                                       'vehicle_type', 'callsign_group',
 698                                       'quarter', 'time_of_day'])['count'].transform('sum')
 699        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 700
 701        merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 702
 703
 704
 705    def hourly_arrival_by_qtr_probs(self):
 706        """
 707
 708            Calculates the proportions of calls arriving in any given hour
 709            stratified by yearly quarter
 710
 711        """
 712
 713        ia_df = self.df[['quarter', 'hour']].dropna()
 714
 715        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
 716        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
 717        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)
 718
 719        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")
 720
 721
 722    def callsign_group_by_ampds_card_and_hour_probs(self):
 723        """
 724
            Calculates the probability of a specific callsign group being allocated to
            a call based on the AMPDS card category and hour of day
 727
 728        """
 729        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')
 730
 731        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
 732        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 733
 734        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode = "w+", index=False)
 735
 736    def callsign_group_by_ampds_card_probs(self):
 737        """
 738
            Calculates the probability of a specific callsign group being allocated to
            a call based on the AMPDS card category
 741
 742        """
 743
 744        callsign_df = self.df[self.df['callsign_group'] != 'Other']
 745
 746        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')
 747
 748        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
 749        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 750
 751        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode = "w+", index=False)
 752
 753    def callsign_group(self):
 754        """
            Calculates the probability of a specific callsign group being allocated to
            a call
 757        """
 758        df = self.df.copy()
 759
 760        # Convert time fields to numeric
 761        time_fields = [
 762            "time_allocation", "time_mobile", "time_to_scene",
 763            "time_on_scene", "time_to_hospital", "time_to_clear"
 764        ]
 765        for col in time_fields:
 766            df[col] = pd.to_numeric(df[col], errors="coerce")
 767
 768        # Calculate total job duration in minutes
 769        df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 770
 771        # Compute job start and end times
 772        df["start_time"] = df["inc_date"]
 773        df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 774
 775        # Sort jobs by start time
 776        df = df.sort_values(by="start_time").reset_index(drop=True)
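        # Sorting by start time allows the overlap scan below to break early: once a later
        # job starts after the current job's end time, no subsequent job can overlap it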
 777
 778        # Set to hold indices of jobs that overlap (but only the later-starting ones)
 779        overlapping = set()
 780
 781        # Check for overlaps
 782        for i in range(len(df)):
 783            this_end = df.at[i, "end_time"]
 784
 785            # Compare only to later jobs
 786            for j in range(i + 1, len(df)):
 787                next_start = df.at[j, "start_time"]
 788                if next_start >= this_end:
 789                    break  # No more possible overlaps
 790                # If it starts before i's job ends, it's overlapping
 791                overlapping.add(j)
 792
 793        # Mark the overlaps in the dataframe
 794        df["overlaps"] = df.index.isin(overlapping)
 795
 796        # Filter out overlapping jobs
 797        df_no_overlap = df[~df["overlaps"]]
 798
 799        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 800        # the vehicles to be recorded as offline
 801
 802        data = df_no_overlap.copy()
 803
 804        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 805        # off for servicing if that data is available
 806        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 807        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 808
 809        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 810        data["vehicle"] = data["vehicle"].str.lower()
 811
 812        # Create a cross-join between data and ad_hoc
 813        data['key'] = 1
 814        ad_hoc['key'] = 1
 815        merged = data.merge(ad_hoc, on='key')
 816
 817        # Keep rows where inc_date falls within the offline period
 818        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 819
 820        # Filter out those rows from the original data
 821        df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
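        # Note: this filter matches on timestamp only (not on which vehicle attended), so an
        # incident is excluded whenever it falls within any resource's recorded offline window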
 822
 823        callsign_df = (
 824            df_no_overlap
 825            .assign(
 826                helicopter_benefit=np.select(
 827                    [
 828                        df_no_overlap["cc_benefit"] == "y",
 829                        df_no_overlap["ec_benefit"] == "y",
 830                        df_no_overlap["hems_result"].isin([
 831                            "Stand Down En Route",
 832                            "Landed but no patient contact",
 833                            "Stand Down Before Mobile"
 834                        ])
 835                    ],
 836                    ["y", "y", "n"],
 837                    default=df_no_overlap["helicopter_benefit"]
 838                ),
 839                care_category=np.select(
 840                    [
 841                        df_no_overlap["cc_benefit"] == "y",
 842                        df_no_overlap["ec_benefit"] == "y"
 843                    ],
 844                    ["CC", "EC"],
 845                    default="REG"
 846                )
 847            )
 848        )
 849
 850        callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 851
 852        callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count')
 853
 854        total_counts = len(callsign_df)
 855        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 856
 857        callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False)
 858
 859
 860    # def callsign_group_by_care_category(self):
 861    #     """
 862
 863    #         Calculates the probabilty of a specific callsign being allocated to
 864    #         a call based on the care category
 865
 866    #     """
 867    #     df = self.df.copy()
 868
 869    #     # Convert time fields to numeric
 870    #     time_fields = [
 871    #         "time_allocation", "time_mobile", "time_to_scene",
 872    #         "time_on_scene", "time_to_hospital", "time_to_clear"
 873    #     ]
 874    #     for col in time_fields:
 875    #         df[col] = pd.to_numeric(df[col], errors="coerce")
 876
 877    #     # Calculate total job duration in minutes
 878    #     df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 879
 880    #     # Compute job start and end times
 881    #     df["start_time"] = df["inc_date"]
 882    #     df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 883
 884    #     # Sort jobs by start time
 885    #     df = df.sort_values(by="start_time").reset_index(drop=True)
 886
 887    #     # Set to hold indices of jobs that overlap (but only the later-starting ones)
 888    #     overlapping = set()
 889
 890    #     # Check for overlaps
 891    #     for i in range(len(df)):
 892    #         this_end = df.at[i, "end_time"]
 893
 894    #         # Compare only to later jobs
 895    #         for j in range(i + 1, len(df)):
 896    #             next_start = df.at[j, "start_time"]
 897    #             if next_start >= this_end:
 898    #                 break  # No more possible overlaps
 899    #             # If it starts before i's job ends, it's overlapping
 900    #             overlapping.add(j)
 901
 902    #     # Mark the overlaps in the dataframe
 903    #     df["overlaps"] = df.index.isin(overlapping)
 904
 905    #     # Filter out overlapping jobs
 906    #     df_no_overlap = df[~df["overlaps"]]
 907
 908
 909    #     callsign_df = (
 910    #         df_no_overlap
 911    #         .assign(
 912    #             helicopter_benefit=np.select(
 913    #                 [
 914    #                     df_no_overlap["cc_benefit"] == "y",
 915    #                     df_no_overlap["ec_benefit"] == "y",
 916    #                     df_no_overlap["hems_result"].isin([
 917    #                         "Stand Down En Route",
 918    #                         "Landed but no patient contact",
 919    #                         "Stand Down Before Mobile"
 920    #                     ])
 921    #                 ],
 922    #                 ["y", "y", "n"],
 923    #                 default=df_no_overlap["helicopter_benefit"]
 924    #             ),
 925    #             care_category=np.select(
 926    #                 [
 927    #                     df_no_overlap["cc_benefit"] == "y",
 928    #                     df_no_overlap["ec_benefit"] == "y"
 929    #                 ],
 930    #                 ["CC", "EC"],
 931    #                 default="REG"
 932    #             )
 933    #         )
 934    #     )
 935
 936    #     callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 937
 938    #     callsign_counts = callsign_df.groupby(['care_category', 'callsign_group']).size().reset_index(name='count')
 939
 940    #     total_counts = callsign_counts.groupby(['care_category'])['count'].transform('sum')
 941    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 942
 943    #     callsign_counts.to_csv('distribution_data/callsign_group_by_care_category_probs.csv', mode = "w+", index=False)
 944
 945    #========== ARCHIVED CODE ============ #
 946    # def vehicle_type_by_month_probs(self):
 947    #     """
 948
 949    #         Calculates the probabilty of a car/helicopter being allocated to
 950    #         a call based on the callsign group and month of the year
 951
 952    #     """
 953    #     callsign_counts = self.df.groupby(['callsign_group', 'month', 'vehicle_type']).size().reset_index(name='count')
 954
 955    #     total_counts = callsign_counts.groupby(['callsign_group', 'month'])['count'].transform('sum')
 956    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 957
 958    #     callsign_counts.to_csv('distribution_data/vehicle_type_by_month_probs.csv', mode = "w+")
 959    #========== END ARCHIVED CODE ============ #
    def vehicle_type_by_quarter_probs(self):
        """

            Calculates the probability of a car/helicopter being allocated to
            a call based on the callsign group and quarter of the year

            Quarter accounts for seasonal variation without being as affected by
            the small sample sizes that monthly stratification would produce.

        """
 969        data = self.df.copy()
 970
 971        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 972        # the vehicles to be recorded as offline
 973
 974        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 975        # off for servicing if that data is available
 976        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 977        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 978
 979        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 980        data["vehicle"] = data["vehicle"].str.lower()
 981
 982        # Create a cross-join between data and ad_hoc
 983        data['key'] = 1
 984        ad_hoc['key'] = 1
 985        merged = data.merge(ad_hoc, on='key')
 986
 987        # Keep rows where inc_date falls within the offline period
 988        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 989
 990        # Filter out those rows from the original data
 991        filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 992
 993        # First, calculate overall props
 994        callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
 995
 996        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
 997        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 998
 999        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1000
1001        # Then, redo by quarter
1002        callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count')
1003
1004        total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum')
1005        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1006
1007        callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+")
1008
1009    def vehicle_type_probs(self):
1010        """
1011
            Calculates the probability of a car/helicopter being allocated to
1013            a call based on the callsign group
1014
1015        """
1016
1017        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
1018
1019        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
1020        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1021
1022        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1023
1024    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1025        """
1026
            Calculates the probability of a specific HEMS result being allocated to
            a call based on the callsign group and vehicle type
1029
1030            TODO: These probability calculation functions could probably be refactored into a single
1031            function and just specify columns and output name
1032
1033        """
1034        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')
1035
1036        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
1037        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)
1038
1039        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode = "w+", index=False)
1040
1041
1042    #========== ARCHIVED CODE ============ #
1043    # def hems_result_by_care_cat_and_helicopter_benefit_probs(self):
1044    #     """
1045
1046    #         Calculates the probabilty of a specific HEMS result being allocated to
1047    #         a call based on the care category amd whether a helicopter is beneficial
1048
1049    #     """
1050
1051    #     # Wrangle the data...trying numpy for a change
1052
1053    #     hems_df = (
1054    #         self.df
1055    #         .assign(
1056    #             helicopter_benefit=np.select(
1057    #                 [
1058    #                     self.df["cc_benefit"] == "y",
1059    #                     self.df["ec_benefit"] == "y",
1060    #                     self.df["hems_result"].isin([
1061    #                         "Stand Down En Route",
1062    #                         "Landed but no patient contact",
1063    #                         "Stand Down Before Mobile"
1064    #                     ])
1065    #                 ],
1066    #                 ["y", "y", "n"],
1067    #                 default=self.df["helicopter_benefit"]
1068    #             ),
1069    #             care_cat=np.select(
1070    #                 [
1071    #                     self.df["cc_benefit"] == "y",
1072    #                     self.df["ec_benefit"] == "y"
1073    #                 ],
1074    #                 ["CC", "EC"],
1075    #                 default="REG"
1076    #             )
1077    #         )
1078    #     )
1079
1080    #     hems_counts = hems_df.groupby(['hems_result', 'care_cat', 'helicopter_benefit']).size().reset_index(name='count')
1081
1082    #     hems_counts['total'] = hems_counts.groupby(['care_cat', 'helicopter_benefit'])['count'].transform('sum')
1083    #     hems_counts['proportion'] = round(hems_counts['count'] / hems_counts['total'], 4)
1084
1085    #     hems_counts.to_csv('distribution_data/hems_result_by_care_cat_and_helicopter_benefit_probs.csv', mode = "w+", index=False)
1086    #========== END ARCHIVED CODE ============ #
1087
1088    #========== ARCHIVED CODE ============ #
1089    # def pt_outcome_by_hems_result_and_care_category_probs(self):
1090    #     """
1091
1092    #         Calculates the probabilty of a specific patient outcome based on HEMS result
1093
1094    #     """
1095
1096    #     hems_df = (
1097    #         self.df
1098    #         .assign(
1099    #             helicopter_benefit=np.select(
1100    #                 [
1101    #                     self.df["cc_benefit"] == "y",
1102    #                     self.df["ec_benefit"] == "y",
1103    #                     self.df["hems_result"].isin([
1104    #                         "Stand Down En Route",
1105    #                         "Landed but no patient contact",
1106    #                         "Stand Down Before Mobile"
1107    #                     ])
1108    #                 ],
1109    #                 ["y", "y", "n"],
1110    #                 default=self.df["helicopter_benefit"]
1111    #             ),
1112    #             care_category=np.select(
1113    #                 [
1114    #                     self.df["cc_benefit"] == "y",
1115    #                     self.df["ec_benefit"] == "y"
1116    #                 ],
1117    #                 ["CC", "EC"],
1118    #                 default="REG"
1119    #             )
1120    #         )
1121    #     )
1122
1123    #     po_counts = hems_df.groupby(['pt_outcome', 'hems_result', 'care_category']).size().reset_index(name='count')
1124
1125    #     po_counts['total'] = po_counts.groupby(['hems_result', 'care_category'])['count'].transform('sum')
1126    #     po_counts['proportion'] = round(po_counts['count'] / po_counts['total'], 4)
1127
1128    #     po_counts.to_csv('distribution_data/pt_outcome_by_hems_result_and_care_category_probs.csv', mode = "w+")
1129    #========== END ARCHIVED CODE ============ #
1130
    def school_holidays(self) -> None:
        """
            Function to generate a CSV file containing school holiday
            start and end dates for a given year. The year range is determined
            by the submitted data (plus the configured number of additional years
            beyond the end of the data, for good measure)
        """
1137
1138        min_date = self.df.inc_date.min()
1139        max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years))
1140
1141        u = Utils()
1142
1143        years_of_holidays_list = u.years_between(min_date, max_date)
1144
1145        sh = pd.DataFrame(columns=['year', 'start_date', 'end_date'])
1146
1147        for i, year in enumerate(years_of_holidays_list):
1148            tmp = u.calculate_term_holidays(year)
1149
1150            if i == 0:
1151                sh = tmp
1152            else:
1153                sh = pd.concat([sh, tmp])
1154
1155        sh.to_csv('actual_data/school_holidays.csv', index = False)
1156
1157
    # These functions wrangle historical data to provide a comparison against the simulation outputs
1159
1160    def historical_jobs_per_day_per_callsign(self):
1161        df = self.df
1162
1163        df["date"] = pd.to_datetime(df["inc_date"]).dt.date
1164        all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index()
1165        all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True)
1166
1167        all_combinations = pd.DataFrame(
1168            list(itertools.product(df['date'].unique(), df['callsign'].unique())),
1169            columns=['date', 'callsign']
1170        ).dropna()
1171
1172        merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left')
1173        merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int)
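        # Days on which a callsign attended no jobs are retained (filled with 0) so the
        # jobs-per-day distribution includes zero-job days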
1174
1175        all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"})
1176        all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False)
1177
1178    def historical_care_cat_counts(self):
1179        """
1180        Process historical incident data to categorize care types and compute hourly counts.
1181
1182        This method performs the following steps:
1183        - Converts incident dates to datetime format.
1184        - Extracts month start and hour from the incident date.
1185        - Categorizes each incident into care categories based on benefit flags and attendance.
1186        - Counts the number of incidents by hour and care category.
1187        - Outputs these counts to a CSV file.
1188        - Computes and writes the proportion of regular care jobs with a helicopter benefit
1189        (excluding those not attended by a DAA resource) to a text file.
1190
1191        Outputs:
1192        - CSV file: 'historical_data/historical_care_cat_counts.csv'
1193        - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
1194        """
1195
1196        df_historical = self.df
1197
1198        df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date'])
1199        # Extract the first day of the month and the hour of each incident
1200        df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01")
1201        df_historical['hour'] = df_historical.inc_date.dt.hour
1202
1203        conditions = [
1204            df_historical['cc_benefit'] == 'y',
1205            df_historical['ec_benefit'] == 'y',
1206            df_historical['helicopter_benefit'] == 'y',
1207            df_historical['callsign_group'] == 'Other'
1208        ]
1209
1210        choices = [
1211            'CC',
1212            'EC',
1213            'REG - helicopter benefit',
1214            'Unknown - DAA resource did not attend'
1215        ]
1216        # Assign care category to each record
1217        # If the case did not meet any of the criteria in 'conditions', it will default
1218        # to being labelled as a 'regular/REG' case (i.e there was no benefit recorded)
1219        df_historical['care_category'] = np.select(conditions, choices, default='REG')
1220
1221        # Count occurrences grouped by hour and care category
1222        historical_value_counts_by_hour = (
1223            df_historical.value_counts(["hour", "care_category"])
1224            .reset_index(name="count")
1225            )
1226        # Output to CSV for use in tests and visualisations
1227        (historical_value_counts_by_hour
1228         .sort_values(['hour', 'care_category'])
1229         .to_csv("historical_data/historical_care_cat_counts.csv"))
1230
1231        # Also output the % of regular (not cc/ec) jobs with a helicopter benefit
1232        # We assume these regular jobs follow different logic because they have an obvious expected patient
1233        # benefit from having a helicopter allocated, and we assume that benefit is apparent at the time
1234        # the call is placed (for example, the casualty being located in a remote location)
1235
1236        numerator = (historical_value_counts_by_hour[
1237            historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit"
1238            ]["count"].sum())
1239
1240        denominator = (historical_value_counts_by_hour[
1241            (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") |
1242            (historical_value_counts_by_hour["care_category"] == "REG")
1243            ]["count"].sum())
1244
1245        with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file:
1246            heli_benefit_file.write(json.dumps((numerator/denominator).round(4)))
1247
1248        # Count occurrences grouped by hour and care category
1249        historical_value_counts_by_hour_cc_ec = (
1250            df_historical.value_counts(["hour", "care_category", "helicopter_benefit"])
1251            .reset_index(name="count")
1252            )
1253
1254        # Output to CSV for use in tests and visualisations (currently disabled)
1255        # (historical_value_counts_by_hour_cc_ec
1256        #  .sort_values(["hour", "care_category", "helicopter_benefit"])
1257        #  .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv"))
1258
1259
1260        numerator_cc = (
1261            historical_value_counts_by_hour_cc_ec[
1262                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") &
1263                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1264                ]["count"].sum())
1265
1266        denominator_cc = (
1267            historical_value_counts_by_hour_cc_ec[
1268                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC")
1269                ]["count"].sum())
1270
1271        with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file:
1272            heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4)))
1273
1274        numerator_ec = (
1275            historical_value_counts_by_hour_cc_ec[
1276                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") &
1277                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1278                ]["count"].sum())
1279
1280        denominator_ec = (
1281            historical_value_counts_by_hour_cc_ec[
1282                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC")
1283                ]["count"].sum())
1284
1285        with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file:
1286            heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))
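        # Illustrative sketch (not executed) of the np.select care-category assignment above, using
        # made-up rows: conditions are evaluated in order, so a job flagged with both cc_benefit and
        # ec_benefit is labelled 'CC', and a job matching no condition defaults to 'REG':
        #
        #   toy = pd.DataFrame({'cc_benefit':         ['y',  None, None],
        #                       'ec_benefit':         ['y',  'y',  None],
        #                       'helicopter_benefit': [None, None, None],
        #                       'callsign_group':     ['H70', 'H70', 'Other']})
        #   np.select([toy['cc_benefit'] == 'y', toy['ec_benefit'] == 'y',
        #              toy['helicopter_benefit'] == 'y', toy['callsign_group'] == 'Other'],
        #             ['CC', 'EC', 'REG - helicopter benefit', 'Unknown - DAA resource did not attend'],
        #             default='REG')
        #   # -> ['CC', 'EC', 'Unknown - DAA resource did not attend']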
1287
1288
1289
1290    def historical_monthly_totals(self):
1291        """
1292            Calculates monthly incident totals, including counts of car and helicopter stand-downs, from the provided dataset of historical data
1293        """
1294
1295        # Multiple resources can be sent to the same job.
1296        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
1297            .drop_duplicates(subset="inc_date", keep="first").copy()
1298
1299        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down", na=False)
1300        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
1301        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)
1302
1303        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
1304                                stand_down_car=('stand_down_car', 'sum'),
1305                                stand_down_helicopter=('stand_down_helicopter', 'sum'),
1306                                total_jobs=('vehicle_type', 'size')
1307                            ).reset_index()
1308
1309        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)
1310
1311    def historical_monthly_totals_all_calls(self):
1312        """
1313            Calculates monthly totals of all calls from the provided dataset of historical data
1314        """
1315
1316        # Multiple resources can be sent to the same job.
1317        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()
1318
1319        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()
1320
1321        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)
1322
1323    def historical_monthly_totals_by_callsign(self):
1324        """
1325            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1326        """
1327
1328        # Multiple resources can be sent to the same job.
1329        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()
1330
1331        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()
1332
1333        #print(monthly_totals_df.head())
1334
1335        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1336
1337        #print(monthly_totals_pivot_df.head())
1338
1339        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)
1340
1341    def historical_monthly_totals_by_hour_of_day(self):
1342        """
1343            Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
1344        """
1345
1346        # Multiple resources can be sent to the same job.
1347        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
1348            .drop_duplicates(subset="inc_date", keep="first")
1349
1350        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()
1351
1352        #print(monthly_totals_df.head())
1353
1354        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1355
1356        #print(monthly_totals_pivot_df.head())
1357
1358        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)
1359
1360    def historical_monthly_totals_by_day_of_week(self):
1361        """
1362            Calculates number of incidents per month stratified by day of the week
1363        """
1364
1365        # Multiple resources can be sent to the same job.
1366        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
1367            .drop_duplicates(subset="inc_date", keep="first")
1368
1369        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()
1370
1371        #print(monthly_totals_df.head())
1372
1373        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1374
1375        #print(monthly_totals_pivot_df.head())
1376
1377        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)
1378
1379    def historical_median_time_of_activities_by_month_and_resource_type(self):
1380        """
1381            Calculate the median time for each of the job cycle phases stratified by month and vehicle type
1382        """
1383
1384        median_df = self.df[['first_day_of_month', 'time_allocation',
1385                             'time_mobile', 'time_to_scene', 'time_on_scene',
1386                             'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1387
1388        median_df['total_job_time'] = median_df[[
1389            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1390            'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True)
1391
1392        # Optionally, zeros could be replaced with NaN to exclude them from the median calculation, since a job
1393        # stood down en route has a time_on_scene of zero, which would drag down the median. Currently disabled:
1394        # median_df.replace(0, np.nan, inplace=True)
1395
1396        # Grouping by month and resource_type, calculating medians
1397        median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index()
1398
1399        pivot_data = median_times.pivot_table(
1400            index='first_day_of_month',
1401            columns='vehicle_type',
1402            values=['time_allocation', 'time_mobile', 'time_to_scene',
1403                    'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time']
1404        )
1405
1406        pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns]
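        # pivot_table with a list of 'values' produces MultiIndex columns of the form
        # (measure, vehicle_type); the line above flattens e.g. ('time_on_scene', 'helicopter')
        # into 'median_helicopter_time_on_scene' before writing to CSV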
1407        pivot_data = pivot_data.reset_index()
1408
1409        pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1410
1411    def historical_monthly_resource_utilisation(self):
1412        """
1413            Calculates number of, and time spent on, incidents per month stratified by callsign
1414        """
1415
1416        # Multiple resources can be sent to the same job.
1417        monthly_df = self.df[[
1418            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
1419            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
1420            'time_to_clear']].copy()
1421
1422        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)
1423
1424        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
1425            .agg(n = ('callsign', 'size'), total_time = ('total_time', 'sum'))
1426
1427        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time'])
1428
1429        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in  monthly_totals_pivot_df.columns]
1430        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()
1431
1432        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)
1433
1434    def historical_daily_calls_breakdown(self):
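        """
            Counts the number of calls received on each day of the historical data, writing the
            day-by-day breakdown to 'historical_data/historical_daily_calls_breakdown.csv' and the
            frequency of each daily call total to 'historical_data/historical_daily_calls.csv'
        """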
1435
1436        df = self.df
1437        # Convert inc_date to date only (remove time)
1438        df['date'] = pd.to_datetime(df['inc_date']).dt.date
1439
1440        # Count number of calls per day
1441        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')
1442
1443        # Save the daily call counts with a 'day' index column
1444        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
1445        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
1446        calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)
1447
1448        # Count how many days had the same number of calls
1449        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
1450        calls_per_day_summary.columns = ['calls_in_day', 'days']
1451        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)
1452
1453    def historical_missed_jobs(self):
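        """
            Summarises historical jobs by whether a HEMS resource (helicopter or car) was available
            and sent, or no HEMS resource was available, stratified by month, by hour, and by quarter
            and hour, writing each summary to a CSV in 'historical_data/'
        """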
1454        df = self.df
1455        df["date"] = pd.to_datetime(df["inc_date"])
1456        df["hour"] = df["date"].dt.hour
1457        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
1458        df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x=="Other" else "HEMS (helo or car) available and sent")
1459        df["quarter"] = df["inc_date"].dt.quarter
1460
1461        # By month
1462        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','month_start'])
1463        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)
1464
1465        # By hour
1466        count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','hour'])
1467        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)
1468
1469        # By quarter and hour
1470        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter','callsign_group_simplified','hour'])
1471        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
1472
1473    def upper_allowable_time_bounds(self):
1474        """
1475            Calculates the maximum permissible time for each phase of an incident based on the supplied historical data.
1476            The intended cap is 1.5x the upper quartile of the data distribution; at present this method only prints the upper quartile.
1477        """
1478
1479        median_df = self.df[[
1480            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1481            'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1482
1483        # Replacing zeros with NaN to exclude from median calculation
1484        # since if an HEMS result is Stood down en route, then time_on_scene
1485        # would be zero and affect the median
1486        median_df.replace(0, np.nan, inplace=True)
1487
1488        print(median_df.quantile(.75))
1489        # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1490
1491    def historical_job_durations_breakdown(self):
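        """
            Produces a long-format breakdown of each job's activity times (allocation, mobilisation,
            travel to scene, on scene, travel to hospital, clear and total duration) by callsign and
            vehicle type, and writes it to 'historical_data/historical_job_durations_breakdown.csv'
        """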
1492
1493        df = self.df
1494
1495        cols = [
1496            'callsign', 'vehicle_type',
1497            'time_allocation', 'time_mobile',
1498            'time_to_scene', 'time_on_scene',
1499            'time_to_hospital', 'time_to_clear'
1500        ]
1501        df2 = df[cols].copy()
1502
1503        # 2. Add a 1-based row identifier
1504        df2['job_identifier'] = range(1, len(df2) + 1)
1505
1506        # 3. Compute total_duration as the row-wise sum of the time columns
1507        time_cols = [
1508            'time_allocation', 'time_mobile',
1509            'time_to_scene', 'time_on_scene',
1510            'time_to_hospital', 'time_to_clear'
1511        ]
1512        df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True)
1513
1514        #print(df2.head())
1515
1516        # 4. Pivot (melt) to long format
1517        df_long = df2.melt(
1518            id_vars=['job_identifier', 'callsign', 'vehicle_type'],
1519            value_vars=time_cols + ['total_duration'],
1520            var_name='name',
1521            value_name='value'
1522        )
1523
1524        #print(df_long[df_long.job_identifier == 1])
1525
1526        # 5. Drop any rows where callsign or vehicle_type is missing
1527        df_long = df_long.dropna(subset=['callsign', 'vehicle_type'])
1528        df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True)
1529
1530        # 6. Write out to CSV
1531        df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False)
1532
1533    # ========== ARCHIVED CODE - v1 of calculate_availability_row ======================== #
1534    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df, period_start, period_end):
1535    #     """
1536    #     Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1537    #     Returns data tagged with bin, quarter, and downtime reason.
1538    #     """
1539
1540    #     registration = row['aircraft'].lower()
1541    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1542    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1543    #     reason = row.get('reason', None)
1544
1545    #     hour = downtime_start.hour
1546    #     if 0 <= hour <= 5:
1547    #         six_hour_bin = '00-05'
1548    #     elif 6 <= hour <= 11:
1549    #         six_hour_bin = '06-11'
1550    #     elif 12 <= hour <= 17:
1551    #         six_hour_bin = '12-17'
1552    #     else:
1553    #         six_hour_bin = '18-23'
1554
1555    #     quarter = downtime_start.quarter
1556
1557    #     # Match callsign
1558    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1559    #     if match.empty:
1560    #         return {
1561    #             'registration': registration,
1562    #             'offline': downtime_start,
1563    #             'online': downtime_end,
1564    #             'six_hour_bin': six_hour_bin,
1565    #             'quarter': quarter,
1566    #             'total_offline': None,
1567    #             'scheduled_minutes': None,
1568    #             'reason': reason,
1569    #             'proportion': None
1570    #         }
1571    #     callsign = match.iloc[0]['callsign']
1572
1573    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1574    #     if rota_rows.empty:
1575    #         return {
1576    #             'registration': registration,
1577    #             'offline': downtime_start,
1578    #             'online': downtime_end,
1579    #             'six_hour_bin': six_hour_bin,
1580    #             'quarter': quarter,
1581    #             'total_offline': None,
1582    #             'scheduled_minutes': None,
1583    #             'reason': reason,
1584    #             'proportion': None
1585    #         }
1586
1587    #     # Clip evaluation window to downtime period
1588    #     eval_start = max(downtime_start.normalize(), pd.to_datetime(period_start, utc=True))
1589    #     eval_end = min(downtime_end.normalize(), pd.to_datetime(period_end, utc=True))
1590
1591    #     total_scheduled_minutes = 0
1592    #     total_overlap_minutes = 0
1593
1594    #     current_day = eval_start
1595    #     while current_day <= eval_end:
1596    #         month = current_day.month
1597    #         season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1598
1599    #         for _, rota in rota_rows.iterrows():
1600    #             start_hour = rota[f'{season}_start']
1601    #             end_hour = rota[f'{season}_end']
1602
1603    #             rota_start = current_day + timedelta(hours=start_hour)
1604    #             rota_end = current_day + timedelta(hours=end_hour)
1605    #             if end_hour <= start_hour:
1606    #                 rota_end += timedelta(days=1)
1607
1608    #             # Count scheduled time regardless of overlap
1609    #             scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1610    #             total_scheduled_minutes += scheduled_minutes
1611
1612    #             # Count overlap only if intersecting with downtime
1613    #             overlap_start = max(downtime_start, rota_start)
1614    #             overlap_end = min(downtime_end, rota_end)
1615    #             if overlap_end > overlap_start:
1616    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1617    #                 total_overlap_minutes += overlap_minutes
1618
1619    #         current_day += timedelta(days=1)
1620
1621    #     if total_scheduled_minutes == 0:
1622    #         proportion = None
1623    #     else:
1624    #         proportion = total_overlap_minutes / total_scheduled_minutes
1625
1626    #     return {
1627    #         'registration': registration,
1628    #         'offline': downtime_start,
1629    #         'online': downtime_end,
1630    #         'six_hour_bin': six_hour_bin,
1631    #         'quarter': quarter,
1632    #         'total_offline': total_overlap_minutes,
1633    #         'scheduled_minutes': total_scheduled_minutes,
1634    #         'reason': reason,
1635    #         'proportion': proportion
1636    #     }
1637
1638
1639    # ============= ARCHIVED CODE - v2 of calculate_availability_row ======================= #
1640    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df):
1641    #     """
1642    #         Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1643    #         Returns data tagged with bin, quarter, and downtime reason.
1644    #     """
1645
1646    #     registration = row['aircraft'].lower()
1647    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1648    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1649    #     reason = row.get('reason', None)
1650
1651    #     hour = downtime_start.hour
1652    #     if 0 <= hour <= 5:
1653    #         six_hour_bin = '00-05'
1654    #     elif 6 <= hour <= 11:
1655    #         six_hour_bin = '06-11'
1656    #     elif 12 <= hour <= 17:
1657    #         six_hour_bin = '12-17'
1658    #     else:
1659    #         six_hour_bin = '18-23'
1660
1661    #     quarter = downtime_start.quarter
1662
1663    #     # Match callsign
1664    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1665    #     if match.empty:
1666    #         return {
1667    #             'registration': registration,
1668    #             'offline': downtime_start,
1669    #             'online': downtime_end,
1670    #             'six_hour_bin': six_hour_bin,
1671    #             'quarter': quarter,
1672    #             'total_offline': None,
1673    #             'scheduled_minutes': None,
1674    #             'reason': reason,
1675    #             'proportion': None
1676    #         }
1677    #     callsign = match.iloc[0]['callsign']
1678
1679    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1680    #     if rota_rows.empty:
1681    #         return {
1682    #             'registration': registration,
1683    #             'offline': downtime_start,
1684    #             'online': downtime_end,
1685    #             'six_hour_bin': six_hour_bin,
1686    #             'quarter': quarter,
1687    #             'total_offline': None,
1688    #             'scheduled_minutes': None,
1689    #             'reason': reason,
1690    #             'proportion': None
1691    #         }
1692
1693    #     month = downtime_start.month
1694    #     season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1695
1696    #     total_scheduled_minutes = 0
1697    #     total_overlap_minutes = 0
1698
1699    #     for _, rota in rota_rows.iterrows():
1700    #         start_hour = rota[f'{season}_start']
1701    #         end_hour = rota[f'{season}_end']
1702
1703    #         for base_day in [downtime_start.normalize() - timedelta(days=1),
1704    #                         downtime_start.normalize(),
1705    #                         downtime_start.normalize() + timedelta(days=1)]:
1706
1707    #             rota_start = base_day + timedelta(hours=start_hour)
1708    #             rota_end = base_day + timedelta(hours=end_hour)
1709    #             if end_hour <= start_hour:
1710    #                 rota_end += timedelta(days=1)
1711
1712    #             overlap_start = max(downtime_start, rota_start)
1713    #             overlap_end = min(downtime_end, rota_end)
1714
1715    #             if overlap_end > overlap_start:
1716    #                 scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1717    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1718
1719    #                 total_scheduled_minutes += scheduled_minutes
1720    #                 total_overlap_minutes += overlap_minutes
1721
1722    #     if total_scheduled_minutes == 0:
1723    #         proportion = None
1724    #     else:
1725    #         proportion = total_overlap_minutes / total_scheduled_minutes
1726
1727    #     return {
1728    #         'registration': registration,
1729    #         'offline': downtime_start,
1730    #         'online': downtime_end,
1731    #         'six_hour_bin': six_hour_bin,
1732    #         'quarter': quarter,
1733    #         'total_offline': total_overlap_minutes,
1734    #         'scheduled_minutes': total_scheduled_minutes,
1735    #         'reason': reason,
1736    #         'proportion': proportion
1737    #     }
1738
1739    # ================ ARCHIVED CODE - ad-hoc unavailability calculation ================ #
1740    # def ad_hoc_unavailability(self, period_start, period_end, include_debugging_cols=False):
1741    #     """Process ad hoc unavailability records into a stratified probability table.
1742
1743    #     Calculates the probability of ad hoc unavailability and availability based
1744    #     on historical data. The data is stratified by aircraft registration,
1745    #     six-hour time bins, and calendar quarters. It ensures all standard
1746    #     reasons ('available', 'crew', 'weather', 'aircraft') are present for
1747    #     each combination. It filters out any registration/quarter/bin pairings
1748    #     with little scheduled time, marking their probabilities as blank. It also adds
1749    #     a count of the ad-hoc unavailability events considered for each probability calculation
1750    #     and the total scheduled time for the resource in that quarter/time period that was part of
1751    #     the calculations.
1752
1753    #     Returns
1754    #     -------
1755    #     None
1756    #         This function does not return a value but saves the calculated
1757    #         probabilities to 'distribution_data/ad_hoc_unavailability.csv'.
1758    #         The CSV file includes columns for registration, six_hour_bin,
1759    #         quarter, reason, probability, and count.
1760
1761    #     Notes
1762    #     -----
1763    #     This function relies on the following input files:
1764    #     - 'external_data/ad_hoc.csv': Contains ad hoc unavailability records.
1765    #     - 'actual_data/HEMS_ROTA.csv': Contains rota information.
1766    #     - 'actual_data/callsign_registration_lookup.csv': Maps callsigns to registrations.
1767    #     It also depends on the 'calculate_availability_row' method within the
1768    #     same class. Ensure that the 'distribution_data' directory exists.
1769    #     """
1770    #     try:
1771    #         # Load data
1772    #         adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1773    #         adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1774    #         rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1775    #         callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1776
1777    #         # Process each ad hoc record
1778    #         results = adhoc_df.apply(
1779    #             lambda row: self.calculate_availability_row(row, rota_df, callsign_lookup_df, period_start, period_end),
1780    #             axis=1
1781    #         )
1782    #         final_df = pd.DataFrame(results.tolist())
1783    #         final_df.to_csv("external_data/ad_hoc_intermediate.csv")
1784
1785    #         # Check if final_df is empty before proceeding
1786    #         if final_df.empty:
1787    #             print("No ad-hoc data processed, skipping file generation.")
1788    #             return
1789
1790    #         # Define the full set of reasons expected
1791    #         all_reasons = ['available', 'crew', 'weather', 'aircraft']
1792
1793    #         # --- Aggregate Data ---
1794    #         # Calculate job count per registration, quarter, AND bin
1795    #         unavailability_instance_counts = final_df.groupby(['registration', 'quarter', 'six_hour_bin']).size().reset_index(name='unavailability_instance_counts')
1796
1797    #         # Downtime by bin + quarter + reason (only for unavailability reasons)
1798    #         grouped = final_df[final_df['reason'].isin(['crew', 'weather', 'aircraft'])]
1799    #         grouped = grouped.groupby(['registration', 'six_hour_bin', 'quarter', 'reason'])['total_offline'].sum().reset_index()
1800
1801    #         # Scheduled time by bin + quarter
1802    #         scheduled_totals = final_df.groupby(['registration', 'six_hour_bin', 'quarter'])['scheduled_minutes'].sum().reset_index()
1803    #         scheduled_totals = scheduled_totals.rename(columns={'scheduled_minutes': 'total_scheduled'})
1804
1805    #         # Merge job counts into scheduled totals
1806    #         scheduled_totals = pd.merge(scheduled_totals, unavailability_instance_counts, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1807
1808    #         # Calculate total downtime per bin + quarter (for 'available' calculation)
1809    #         downtime_totals = grouped.groupby(['registration','six_hour_bin', 'quarter'])['total_offline'].sum().reset_index()
1810    #         downtime_totals = downtime_totals.rename(columns={'total_offline': 'total_downtime'})
1811
1812    #         # --- Create Full Grid ---
1813    #         # Get all unique combinations of registration, quarter, and bin
1814    #         unique_bins = scheduled_totals[['registration', 'quarter', 'six_hour_bin']].drop_duplicates()
1815
1816    #         # Check for empty unique_bins
1817    #         if unique_bins.empty:
1818    #             print("No valid unique bins found, skipping file generation.")
1819    #             return
1820
1821    #         # Create the full grid by crossing unique bins with all reasons
1822    #         full_grid = unique_bins.assign(key=1).merge(pd.DataFrame({'reason': all_reasons, 'key': 1}), on='key').drop('key', axis=1)
1823
1824    #         # --- Merge Data into Full Grid ---
1825    #         full_grid = pd.merge(full_grid, scheduled_totals, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1826    #         full_grid = pd.merge(full_grid, grouped, on=['registration', 'six_hour_bin', 'quarter', 'reason'], how='left')
1827    #         full_grid = pd.merge(full_grid, downtime_totals, on=['registration', 'six_hour_bin', 'quarter'], how='left')
1828
1829    #         # Fill NaNs created during merges
1830    #         full_grid['total_offline'] = full_grid['total_offline'].fillna(0)
1831    #         full_grid['total_downtime'] = full_grid['total_downtime'].fillna(0)
1832    #         full_grid['unavailability_instance_counts'] = full_grid['unavailability_instance_counts'].fillna(0) # Fill job count with 0 for bins that might exist but have no jobs
1833
1834    #         # --- Calculate Probabilities ---
1835    #         # Suppress division by zero warnings - we handle these next
1836    #         with np.errstate(divide='ignore', invalid='ignore'):
1837    #             prob_avail = (full_grid['total_scheduled'] - full_grid['total_downtime']) / full_grid['total_scheduled']
1838    #             prob_unavail = full_grid['total_offline'] / full_grid['total_scheduled']
1839
1840    #             full_grid['probability'] = np.where(
1841    #                 full_grid['reason'] == 'available',
1842    #                 prob_avail,
1843    #                 prob_unavail
1844    #             )
1845
1846    #         # Handle NaN/Inf from division by zero, set them to 0.0 for now.
1847    #         full_grid['probability'] = full_grid['probability'].replace([np.inf, -np.inf], np.nan).fillna(0.0)
1848
1849    #         # --- Apply Threshold and Blanking ---
1850    #         # Condition for setting probability to blank
1851    #         condition_for_blank = (full_grid['total_scheduled'] < 60 * 2 * 30 * 3 ) # If less than 2 hours per day in time period rotad, exclude
1852
1853    #         # Convert probability to object to allow blanks, then apply the condition
1854    #         full_grid['probability'] = full_grid['probability'].astype(object)
1855    #         full_grid.loc[condition_for_blank, 'probability'] = ''
1856
1857    #         # --- Finalize and Save ---
1858    #         # Select and rename columns
1859    #         if include_debugging_cols:
1860    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability', 'unavailability_instance_counts', 'total_offline', 'total_scheduled']]
1861    #             final_prob_df.rename(columns={"total_offine":"total_minutes_offline_for_reason", "total_scheduled": "total_minutes_rotad_availability_in_quarter_and_time_period"})
1862    #         else:
1863    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability']]
1864    #         # final_prob_df = final_prob_df.rename(columns={'job_count': 'count'})
1865
1866    #         # Sort and save
1867    #         final_prob_df = final_prob_df.sort_values(by=['registration', 'quarter', 'six_hour_bin', 'reason']).reset_index(drop=True)
1868    #         final_prob_df.to_csv("distribution_data/ad_hoc_unavailability.csv", index=False)
1869
1870    #         print("Ad-hoc unavailability probability table generated successfully.")
1871
1872    #     except FileNotFoundError:
1873    #         print("Couldn't generate ad-hoc unavailability due to missing file(s). "
1874    #             "Please ensure 'external_data/ad_hoc.csv', "
1875    #             "'actual_data/HEMS_ROTA.csv', and "
1876    #             "'actual_data/callsign_registration_lookup.csv' exist.")
1877    #         pass
1878    #     except Exception as e:
1879    #         print(f"An error occurred during ad-hoc unavailability processing: {e}")
1880    #         pass
1881
1882    def ad_hoc_unavailability(self, period_start, period_end):
1883        """
1884        Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime.
1885
1886        Args:
1887            period_start (str/datetime): Start date for analysis period
1888            period_end (str/datetime): End date for analysis period
1890
1891        Returns:
1892            None (saves results to CSV file)
1893        """
1894        # Load and prepare ad-hoc downtime data
1895        adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1896        adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1897        # Load rota and callsign lookup data
1898        rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1899        callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1900        # Merge rota with callsign lookup to get registration numbers to allow matching with
1901        # ad-hoc data, which uses registrations
1902        full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign")
1903
1904        # Define the hour bands mapping
1905        HOUR_BANDS = {
1906            '00-05': (0, 6),   # 00:00 to 05:59
1907            '06-11': (6, 12),  # 06:00 to 11:59
1908            '12-17': (12, 18), # 12:00 to 17:59
1909            '18-23': (18, 24)  # 18:00 to 23:59
1910        }
1911
1912        # Create list of 6-hour bins
1913        bins = ['00-05', '06-11', '12-17', '18-23']
1914
1915        def is_summer(date_obj, summer_start_month=4, summer_end_month=9):
1916            """
1917            Determine if a date falls in summer months (April-September).
1918
1919            Args:
1920                date_obj: Date object to check
1921
1922            Returns:
1923                bool: True if date is in summer months
1924            """
1925            return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)]
1926
1927        def check_month_is_summer(month, summer_start_month=4, summer_end_month=9):
1928            """
1929            Determine whether a given month falls in the summer season (April-September by default).
1930
1931            Args:
1932                month: Integer month (1-12)
1933
1934            Returns:
1935                str: 'summer' if the month is in the summer months, 'winter' otherwise
1936            """
1937            return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter'
1938
1939        def get_band_for_hour(hour):
1940            """
1941            Return the 6-hour band name for a given hour (0-23).
1942
1943            Args:
1944                hour (int): Hour of day (0-23)
1945
1946            Returns:
1947                str or None: Band name or None if hour is invalid
1948            """
1949            for band_name, (start, end) in HOUR_BANDS.items():
1950                if start <= hour < end:
1951                    return band_name
1952            return None
1953
1954        def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end):
1955            """
1956            Calculate how many minutes of a shift fall within a specific time band.
1957
1958            Args:
1959                shift_start (float): Shift start time in hours (0-23.99)
1960                shift_end (float): Shift end time in hours (0-23.99)
1961                band_start (int): Band start hour
1962                band_end (int): Band end hour
1963
1964            Returns:
1965                int: Minutes of overlap between shift and band
1966            """
1967            # Find the overlap between shift and band
1968            overlap_start = max(shift_start, band_start)
1969            overlap_end = min(shift_end, band_end)
1970
1971            if overlap_start < overlap_end:
1972                return int((overlap_end - overlap_start) * 60)  # Convert to minutes
1973            return 0
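        # For example (hypothetical shift), calculate_minutes_in_band(7, 19, 6, 12) returns 300:
        # a 07:00-19:00 shift overlaps the 06:00-11:59 band by five hours (300 minutes)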
1974
1975        # Create date range for analysis period
1976        date_range = pd.date_range(start=period_start,
1977                            end=period_end,
1978                            freq='D')
1979        daily_df = pd.DataFrame({'date': date_range})
1980        daily_df['date'] = pd.to_datetime(daily_df['date'])
1981
1982        # Create MultiIndex from date and bins combinations to get all date/time combinations
1983        multi_index = pd.MultiIndex.from_product(
1984            [daily_df['date'], bins],
1985            names=['date', 'six_hour_bin']
1986            )
1987
1988        # Convert MultiIndex to DataFrame and reset index
1989        daily_df = multi_index.to_frame(index=False).reset_index(drop=True)
1990        # Add quarter information for seasonal analysis
1991        daily_df["quarter"] = daily_df.date.dt.quarter
1992
1993        # Initialize availability columns for each aircraft registration
1994        # Each column will store minutes of scheduled time per time band
1995        for registration in full_rota_df['registration'].unique():
1996            daily_df[registration] = 0 # Initialize with 0 minutes
1997
1998        # Calculate scheduled availability for each date/time band combination
1999        for row_idx, row in daily_df.iterrows():
2000            current_date = row['date']
2001            current_band = row['six_hour_bin']
2002            band_start, band_end = HOUR_BANDS[current_band]
2003
2004            is_current_date_summer = is_summer(current_date)
2005
2006            # row_idx (the index label from iterrows) identifies this date/band row for the updates below
2009
2010            # Iterate through each resource's rota entry
2011            for _, rota_entry in full_rota_df.iterrows():
2012                registration = rota_entry['registration']
2013                # Select appropriate start/end times based on season
2014                start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start'
2015                end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end'
2016                start_hour = rota_entry[start_hour_col]
2017                end_hour = rota_entry[end_hour_col]
2018
2019                total_minutes_for_band = 0
2020
2021                if start_hour < end_hour:
2022                    # Shift within same day
2023                    total_minutes_for_band = calculate_minutes_in_band(
2024                        start_hour, end_hour, band_start, band_end
2025                    )
2026                elif start_hour > end_hour:
2027                    # Shift spans midnight - check both parts
2028
2029                    # Part 1: Today from start_hour to midnight
2030                    if band_end <= 24:  # This band is today
2031                        total_minutes_for_band += calculate_minutes_in_band(
2032                            start_hour, 24, band_start, band_end
2033                        )
2034
2035                    # Part 2: Tomorrow from midnight to end_hour
2036                    # Need to check if this band is for tomorrow
2037                    tomorrow = current_date + pd.Timedelta(days=1)
2038                    tomorrow_rows = daily_df[daily_df['date'] == tomorrow]
2039
2040                    if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values:
2041                        total_minutes_for_band += calculate_minutes_in_band(
2042                            0, end_hour, band_start, band_end
2043                        )
2044
2045                # Update the scheduled time for this aircraft in this time band
2046                daily_df.loc[row_idx, registration] += total_minutes_for_band
2047
2048        # Aggregate scheduled availability by quarter, time band, and registration
2049        available_time = (
2050            daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration")
2051            .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]]
2052            .sum().reset_index()
2053            )
2054
2055        def calculate_availability_row(row, rota_df, callsign_lookup_df):
2056            """
2057            Calculate downtime overlap with scheduled rota for a single downtime event.
2058
2059            Args:
2060                row: DataFrame row containing downtime information
2061                rota_df: DataFrame with rota schedules
2062                callsign_lookup_df: DataFrame mapping callsigns to registrations
2063
2064            Returns:
2065                dict: Dictionary containing processed availability data
2066            """
2067            # Extract downtime information
2068            registration = row['aircraft'].lower()
2069            downtime_start = pd.to_datetime(row['offline'], utc=True)
2070            downtime_end = pd.to_datetime(row['online'], utc=True)
2071            reason = row.get('reason', None)
2072
2073            # Determine which 6-hour bin this downtime starts in
2074            hour = downtime_start.hour
2075            if 0 <= hour <= 5:
2076                six_hour_bin = '00-05'
2077            elif 6 <= hour <= 11:
2078                six_hour_bin = '06-11'
2079            elif 12 <= hour <= 17:
2080                six_hour_bin = '12-17'
2081            else:
2082                six_hour_bin = '18-23'
2083
2084            quarter = downtime_start.quarter
2085
2086            # Find the callsign for this registration
2087            match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
2088
2089            # No matching callsign found
2090            if match.empty:
2091                return {
2092                    'registration': registration,
2093                    'offline': downtime_start,
2094                    'online': downtime_end,
2095                    'six_hour_bin': six_hour_bin,
2096                    'quarter': quarter,
2097                    'total_offline': None,
2098                    'reason': reason
2099                }
2100
2101            callsign = match.iloc[0]['callsign']
2102            # Find rota entries for this callsign
2103            rota_rows = rota_df[rota_df['callsign'] == callsign]
2104            if rota_rows.empty:
2105                return {
2106                    'registration': registration,
2107                    'offline': downtime_start,
2108                    'online': downtime_end,
2109                    'six_hour_bin': six_hour_bin,
2110                    'quarter': quarter,
2111                    'total_offline': None,
2112                    'reason': reason
2113                }
2114
2115            # Determine season for appropriate rota times
2116            month = downtime_start.month
2117            season = check_month_is_summer(month)
2118
2119            total_overlap_minutes = 0
2120
2121            # Calculate overlap between downtime and scheduled rota times
2122            for _, rota in rota_rows.iterrows():
2123                start_hour = rota[f'{season}_start']
2124                end_hour = rota[f'{season}_end']
2125
2126                # Check overlap across multiple days (yesterday, today, tomorrow)
2127                # This handles shifts that span midnight
2128                for base_day in [downtime_start.normalize() - timedelta(days=1),
2129                                downtime_start.normalize(),
2130                                downtime_start.normalize() + timedelta(days=1)]:
2131
2132                    rota_start = base_day + timedelta(hours=start_hour)
2133                    rota_end = base_day + timedelta(hours=end_hour)
2134
2135                    # Handle shifts that cross midnight
2136                    if end_hour <= start_hour:
2137                        rota_end += timedelta(days=1)
2138                    # Calculate overlap between downtime and this rota period
2139                    overlap_start = max(downtime_start, rota_start)
2140                    overlap_end = min(downtime_end, rota_end)
2141
2142                    if overlap_end > overlap_start:
2143                        overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
2144
2145                        total_overlap_minutes += overlap_minutes
2146
2147            return {
2148                'registration': registration,
2149                'offline': downtime_start,
2150                'online': downtime_end,
2151                'six_hour_bin': six_hour_bin,
2152                'quarter': quarter,
2153                'total_offline': total_overlap_minutes,
2154                'reason': reason
2155            }
2156
2157        # Process all ad-hoc downtime events
2158        results = adhoc_df.apply(
2159                        lambda row: calculate_availability_row(
2160                            row, rota_df, callsign_lookup_df),
2161                        axis=1
2162                    )
2163
2164        # Convert results to DataFrame and select relevant columns
2165        unavailability_minutes_df = (
2166            pd.DataFrame(results.tolist())
2167            [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]]
2168            )
2169
2170        # Aggregate offline time by registration, time band, quarter, and reason
2171        offline_durations = unavailability_minutes_df.groupby(
2172            ["registration", "six_hour_bin", "quarter", "reason"]
2173            )[["total_offline"]].sum().reset_index()
2174
2175        # Create complete combinations to ensure all possible categories are represented
2176        # This prevents missing data issues in the final output
2177        registrations = offline_durations['registration'].unique()
2178        six_hour_bins = offline_durations['six_hour_bin'].unique()
2179        quarters = offline_durations['quarter'].unique()
2180        reasons = offline_durations['reason'].unique()
2181
2182        # Generate all possible combinations
2183        all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons))
2184
2185        # Create complete dataframe with all combinations
2186        complete_df = pd.DataFrame(
2187            all_combinations,
2188            columns=['registration', 'six_hour_bin', 'quarter', 'reason']
2189            )
2190
2191        # Merge with original data to preserve existing values, fill missing with 0
2192        offline_durations = complete_df.merge(
2193            offline_durations,
2194            on=['registration', 'six_hour_bin', 'quarter', 'reason'],
2195            how='left'
2196            )
2197
2198        # Fill NaN values with 0 (no downtime for those combinations)
2199        offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0)
2200
2201        # Sort for better readability
2202        offline_durations = offline_durations.sort_values(
2203            ['registration', 'six_hour_bin', 'quarter', 'reason']
2204            ).reset_index(drop=True)
2205
2206        # Ensure consistent case for registration names
2207        available_time["registration"] = available_time["registration"].str.lower()
2208        offline_durations["registration"] = offline_durations["registration"].str.lower()
2209
2210        # Merge scheduled time with downtime data
2211        ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", "six_hour_bin"])
2212        ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"]
2213
2214        # Calculate availability probability (1 - sum of all unavailability probabilities)
2215        available_prop_df = ad_hoc.groupby(
2216            ["quarter", "six_hour_bin", "registration", "rota_time"]
2217            )[["probability"]].sum().reset_index()
2218
2219        available_prop_df["reason"] = "available"
2220        available_prop_df["probability"] = 1 - available_prop_df["probability"]
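        # Worked example with hypothetical figures: if a registration is rotad for 10,000 minutes in a
        # given quarter/six-hour bin, with 500 minutes offline for 'weather' and 250 minutes for
        # 'aircraft', the unavailability probabilities are 0.05 and 0.025, and the 'available'
        # probability is 1 - (0.05 + 0.025) = 0.925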
2221
2222        # Combine unavailability and availability data
2223        final_ad_hoc_df = (
2224            pd.concat([ad_hoc, available_prop_df])
2225            .sort_values(["quarter", "six_hour_bin", "registration", "reason"])
2226            )
2227
2228        # Handle cases where there's no scheduled time (set probability to NaN)
2229        final_ad_hoc_df["probability"] = final_ad_hoc_df.apply(
2230            lambda x: np.nan if x["rota_time"]==0 else x["probability"],
2231            axis=1
2232            )
2233
2234        final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5)
2235
2236        # Save results to CSV
2237        final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv(
2238            "distribution_data/ad_hoc_unavailability.csv", index=False
2239            )
2240
2241
2242    def run_sim_on_historical_params(self):
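        """
            Runs the simulation using the historic rota, callsign lookup and servicing files from
            'tests/rotas_historic/' (copied into 'actual_data/'), then summarises the results
            (care category sent, vehicle type sent and missed jobs) into CSVs under
            'historical_data/calculated/' for comparison against the historical data
        """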
2243        # Ensure all rotas are using default values
2244        rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv")
2245        rota.to_csv("actual_data/HEMS_ROTA.csv", index=False)
2246
2247        callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv")
2248        callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False)
2249
2250        service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv")
2251        service_history.to_csv("actual_data/service_history.csv", index=False)
2252
2253        service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv")
2254        service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False)
2255
2256        print("Generating simulation results...")
2257        removeExistingResults()
2258
2259        total_runs = 30
2260        sim_years = 2
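        # sim_duration is expressed in minutes: 60 minutes x 24 hours x 7 days x 52 weeks x sim_years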
2261        sim_duration = 60 * 24 * 7 * 52 * sim_years
2262
2263        parallelProcessJoblib(
2264            total_runs=total_runs,
2265            sim_duration=sim_duration,
2266            warm_up_time=0,
2267            sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"),
2268            amb_data=False,
2269            print_debug_messages=True
2270        )
2271
2272        collateRunResults()
2273
2274        try:
2275            results_all_runs = pd.read_csv("data/run_results.csv")
2276            # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False)
2277
2278            # save data of counts of suboptimal care category sent
2279            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index()
2280
2281            counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index()
2282
2283            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv")
2284
2285            # save data of counts of suboptimal vehicle type sent
2286            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index()
2287
2288            counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index()
2289
2290            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")
2291
2292
2293            # # Also run the model to get some base-case outputs
2294            # resource_requests = (
2295            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
2296            #     .copy()
2297            #     )
2298
2299            # resource_requests["care_cat"] = (
2300            #     resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y"
2301            #                             and x["care_cat"]=="REG" else x["care_cat"],
2302            #                             axis=1)
2303            #                             )
2304
2305            # missed_jobs_care_cat_summary = (
2306            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
2307            #     .sort_values(["care_cat", "time_type"])
2308            #     .copy()
2309            #     )
2310
2311            # missed_jobs_care_cat_summary["jobs_average"] = (
2312            #     missed_jobs_care_cat_summary["jobs"]/
2313            #     total_runs
2314            #     )
2315
2316            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
2317            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365)
2318            #     ).round(0)
2319
2320            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
2321                    results_all_runs=results_all_runs,
2322                    run_length_days = float(sim_years*365),
2323                    what="summary"
2324                    )
2325
2326            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")
2327
2328
2329            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
2330                    results_all_runs=results_all_runs,
2331                    run_length_days = float(sim_years*365),
2332                    what="breakdown"
2333                    )
2334
2335            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")
2336
2337        except FileNotFoundError:
2338            pass
2339
2340if __name__ == "__main__":
2341    from distribution_fit_utils import DistributionFitUtils
2342    test = DistributionFitUtils('external_data/clean_daa_import_missing_2023_2024.csv',
2343                                calculate_school_holidays=True)
2344    #test = DistributionFitUtils('external_data/clean_daa_import.csv')
2345    test.import_and_wrangle()
2346    test.run_sim_on_historical_params()
2347
2348# Testing ----------
2349# python distribution_fit_utils.py
class DistributionFitUtils:
  22class DistributionFitUtils():
  23    """
  24        # The DistributionFitUtils classa
  25
  26        This class will import a CSV, undertake some light
  27        wrangling and then determine distributions and probabilities required
  28        for the Discrete Event Simulation
  29
  30        example usage:
  31            my_data = DistributionFitUtils('data/my_data.csv')
  32            my_data.import_and_wrangle()
  33
  34    """
  35
  36    def __init__(self, file_path: str, calculate_school_holidays = False, school_holidays_years = 0):
  37
  38        self.file_path = file_path
  39        self.df = pd.DataFrame()
  40
  41        # The number of additional years of school holidays
  42        # that will be calculated beyond the maximum date in the provided dataset
  43        self.school_holidays_years = school_holidays_years
  44        self.calculate_school_holidays = calculate_school_holidays
  45
  46        self.times_to_fit = [
  47            {"hems_result": "Patient Treated but not conveyed by HEMS",
  48            "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  49            {"hems_result": "Patient Conveyed by HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  50            {"hems_result": "Patient Conveyed by land with HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
  51            {"hems_result": "Stand Down" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_clear']},
  52            {"hems_result": "Landed but no patient contact" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
  53        ]
  54
  55        self.sim_tools_distr_plus = [
  56            "poisson",
  57            "bernoulli",
  58            "triang",
  59            "erlang",
  60            "weibull_min",
  61            "expon_weib",
  62            "betabinom",
  63            "pearson3",
  64            "cauchy",
  65            "chi2",
  66            "expon",
  67            "exponpow",
  68            "gamma",
  69            "lognorm",
  70            "norm",
  71            "powerlaw",
  72            "rayleigh",
  73            "uniform",
  74            "neg_binomial",
  75            "zip"
  76        ]
  77        # SR 16-04-2025 Have hardcoded the common distributions
  78        # to make setup for random number generation more robust
  79        #+ get_common_distributions()
  80
  81    def removeExistingResults(self, folder: str) -> None:
  82            """
  83                Removes results from previous fitting
  84            """
  85
  86            matching_files = glob.glob(os.path.join(folder, "*.*"))
  87
  88            print(matching_files)
  89
  90            for file in matching_files:
  91                os.remove(file)
  92
  93
  94    def getBestFit(self, q_times, distr=get_common_distributions(), show_summary=False):
  95        """
  96
  97            Convenience function for Fitter.
  98            Returns the model and parameters that are considered
  99            the 'best fit'.
 100
 101            TODO: Determine how Fitter works this out
 102
 103        """
 104
 105        if(q_times.size > 0):
 106            if(len(distr) > 0):
 107                f = Fitter(q_times, timeout=60, distributions=distr)
 108            else:
 109                f = Fitter(q_times, timeout=60)
 110            f.fit()
 111            if show_summary:
 112                f.summary()
 113            return f.get_best()
 114        else:
 115            return {}
 116
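    # --- Illustrative note (editorial addition, not part of the original source) ---
    # getBestFit() is a thin convenience wrapper around fitter.Fitter. In recent
    # versions of fitter, get_best() returns a one-entry dict mapping the top-ranked
    # distribution name to its fitted parameters, so a call might look like this
    # (values made up purely for illustration):
    #
    #   best = self.getBestFit(np.array([4.0, 6.5, 7.2, 9.1, 11.3]),
    #                          distr=["gamma", "norm"])
    #   # e.g. -> {"gamma": {"a": 2.1, "loc": 0.0, "scale": 3.4}}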
 117
 118    def import_and_wrangle(self):
 119        """
 120
 121            Function to import CSV, add additional columns that are required
 122            and then sequentially execute other class functions to generate
 123            the probabilities and distributions required for the DES.
 124
 125            TODO: Additional logic is required to check the imported CSV
 126            for missing values, incorrect column names, etc.
 127
 128        """
 129
 130        try:
 131            df = pd.read_csv(self.file_path, quoting=QUOTE_ALL)
 132            self.df = df
 133
 134            # Perhaps run some kind of checking function here.
 135
 136        except FileNotFoundError:
 137            print(f"Cannot locate file: {self.file_path}")
 138            return
 139        # If everything is okay, crack on...
 140        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
 141        self.df['date_only'] = pd.to_datetime(df['inc_date'].dt.date)
 142        self.df['hour'] = self.df['inc_date'].dt.hour                      # Hour of the day
 143        self.df['day_of_week'] = self.df['inc_date'].dt.day_name()         # Day of the week (e.g., Monday)
 144        self.df['month'] = self.df['inc_date'].dt.month
 145        self.df['quarter'] = self.df['inc_date'].dt.quarter
 146        self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]')
 147
 148        # Replacing an upper quartile limit on job cycle times and
 149        # instead using a manually specified time frame.
 150        # This has the advantage of allowing manual amendment of the values
 151        # on the front-end
 152        # self.max_values_df = self.upper_allowable_time_bounds()
 153        self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv')
 154        #print(self.min_max_values_df)
 155
 156        #This will be needed for other datasets, but has already been computed for DAA
 157        #self.df['ampds_card'] = self.df['ampds_code'].str[:2]
 158
 159        self.removeExistingResults(Utils.HISTORICAL_FOLDER)
 160        self.removeExistingResults(Utils.DISTRIBUTION_FOLDER)
 161
 162        #get proportions of AMPDS card by hour of day
 163        self.hour_by_ampds_card_probs()
 164
 165        # Determine 'best' distributions for time-based data
 166        self.activity_time_distributions()
 167
 168        # Calculate probability patient will be female based on AMPDS card
 169        self.sex_by_ampds_card_probs()
 170
 171        # Determine 'best' distributions for age ranges stratified by AMPDS card
 172        self.age_distributions()
 173
 174        # Alternative approach to IA times. Start with the probability of a call at a given hour, stratified by quarter
 175        self.hourly_arrival_by_qtr_probs()
 176
 177        # Calculates the mean and standard deviation of the number of incidents per day stratified by quarter
 178        self.incidents_per_day()
 179        self.incidents_per_day_samples()
 180
 181        # Calculate probability of enhanced or critical care being required based on AMPDS card
 182        self.enhanced_or_critical_care_by_ampds_card_probs()
 183
 184        # Calculate HEMS result
 185        self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs()
 186
 187        # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day
 188        # self.callsign_group_by_ampds_card_and_hour_probs()
 189        # self.callsign_group_by_ampds_card_probs()
 190        # self.callsign_group_by_care_category()
 191        self.callsign_group()
 192
 193        # Calculate probability of a particular vehicle type based on callsign group and month of year
 194        # self.vehicle_type_by_month_probs()
 195        self.vehicle_type_by_quarter_probs()
 196        # self.vehicle_type_probs() # Similar to previous but without monthly stratification since ad hoc unavailability should account for this.
 197
 198        # Calculate the patient outcome (conveyed, deceased, unknown)
 199        self.patient_outcome_by_care_category_and_quarter_probs()
 200
 201        # ============= ARCHIVED CODE ================= #
 202        # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day
 203        # self.inter_arrival_times()
 204        # ============= END ARCHIVED CODE ================= #
 205
 206
 207        # ============= ARCHIVED CODE ================= #
 208        # Calculate probability of patient outcome
 209        # Note - this still needs to be run to support another one?
 210        # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign
 211        # self.pt_outcome_by_hems_result_and_care_category_probs()
 212        # ============= END ARCHIVED CODE ================= #
 213
 214        # ============= ARCHIVED CODE ================= #
 215        # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs()
 216        # ============= END ARCHIVED CODE ================= #
 217
 218        # ============= ARCHIVED CODE ================= #
 219        # Calculate probability of HEMS result being allocated to a job based on callsign and hour of day
 220        # self.hems_result_by_callsign_group_and_vehicle_type_probs()
 221        # ============= END ARCHIVED CODE ================= #
 222
 223        # ============= ARCHIVED CODE ================= #
 224        # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit
 225        # self.hems_result_by_care_cat_and_helicopter_benefit_probs()
 226        # ============= END ARCHIVED CODE ================= #
 227
 228
 229        # Calculate school holidays since servicing schedules typically avoid these dates
 230        if self.calculate_school_holidays:
 231            self.school_holidays()
 232
 233        # Calculate historical data
 234        self.historical_monthly_totals()
 235        self.historical_monthly_totals_by_callsign()
 236        self.historical_monthly_totals_by_day_of_week()
 237        self.historical_median_time_of_activities_by_month_and_resource_type()
 238        self.historical_monthly_totals_by_hour_of_day()
 239        self.historical_monthly_resource_utilisation()
 240        self.historical_monthly_totals_all_calls()
 241        self.historical_daily_calls_breakdown()
 242        self.historical_job_durations_breakdown()
 243        self.historical_missed_jobs()
 244        self.historical_jobs_per_day_per_callsign()
 245        self.historical_care_cat_counts()
 246
 247        # Calculate proportions of ad hoc unavailability
 248        try:
 249            # self.ad_hoc_unavailability()
 250            self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31")
 251        except FileNotFoundError:
 252            print("Couldn't find ad-hoc unavailability file")
 253
 254    def hour_by_ampds_card_probs(self):
 255        """
 256
 257            Calculates the proportions of calls that are triaged with
 258            a specific AMPDS card. This is stratified by hour of day
 259
 260            TODO: Determine whether this should also be stratified by yearly quarter
 261
 262        """
 263        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
 264        total_counts = category_counts.groupby('hour')['count'].transform('sum')
 265        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
 266
 267        #category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))
 268
 269        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")
 270
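    # --- Illustrative sketch (editorial addition) ---
    # The proportion columns used throughout this class follow the same pattern:
    # count within the fine-grained grouping, then divide by a transform('sum')
    # over the coarser grouping. A tiny, hypothetical example:
    #
    #   tiny = pd.DataFrame({"hour": [0, 0, 0, 1],
    #                        "ampds_card": ["17", "17", "29", "17"]})
    #   counts = tiny.groupby(["hour", "ampds_card"]).size().reset_index(name="count")
    #   counts["proportion"] = counts["count"] / counts.groupby("hour")["count"].transform("sum")
    #   # hour 0 -> card 17: 0.667, card 29: 0.333; hour 1 -> card 17: 1.0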
 271
 272    def sex_by_ampds_card_probs(self):
 273        """
 274
 275            Calculates the probability that the patient will be female
 276            stratified by AMPDS card.
 277
 278        """
 279        age_df = self.df
 280        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
 281        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
 282        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)
 283
 284        category_counts[category_counts['sex'] =='Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")
 285
 286    def activity_time_distributions(self):
 287        """
 288
 289            Determine the 'best' distribution for each phase of a call
 290            i.e. Allocation time, Mobilisation time, Time to scene
 291            Time on scene, Travel time to hospital and handover, Time to clear.
 292            Not all times will apply to all cases, so the class 'times_to_fit'
 293            variable is a list of dictionaries, which contains the times to fit
 294
 295            The data is currently stratified by the HEMS_result and vehicle type fields.
 296
 297        """
 298
 299        vehicle_type = self.df['vehicle_type'].dropna().unique()
 300
 301        # We'll need to make sure that where a distribution is missing that the time is set to 0 in the model.
 302        # Probably easier than complicated logic to determine what times should be available based on hems_result
 303
 304        final_distr = []
 305
 306        for row in self.times_to_fit:
 307            #print(row)
 308            for ttf in row['times_to_fit']:
 309                for vt in vehicle_type:
 310                    #print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is  {vt}")
 311
 312                    # This line might not be required if data quality is determined when importing the data
 313                    max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0]
 314                    min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0]
 315
 316                    #print(f"Max time is {max_time} and Min time is {min_time}")
 317
 318                    if ttf == 'time_on_scene':
 319                        # There is virtually no data for HEMS_result other than patient conveyed
 320                        # which is causing issues with fitting. For time on scene, will
 321                        # use a simplified fitting ignoring hems_result as a category
 322                        fit_times = self.df[
 323                            (self.df.vehicle_type == vt) &
 324                            (self.df[ttf] >= min_time) &
 325                            (self.df[ttf] <= max_time)
 326                        ][ttf]
 327                    else:
 328                        fit_times = self.df[
 329                            (self.df.vehicle_type == vt) &
 330                            (self.df[ttf] >= min_time) &
 331                            (self.df[ttf] <= max_time) &
 332                            (self.df.hems_result == row['hems_result'])
 333                        ][ttf]
 334                    #print(fit_times[:10])
 335                    best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
 336                    #print(best_fit)
 337
 338                    return_dict = { "vehicle_type": vt, "time_type" : ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)}
 339                    #print(return_dict)
 340                    final_distr.append(return_dict)
 341
 342        with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file:
 343            convert_file.write(json.dumps(final_distr))
 345
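    # --- Illustrative note (editorial addition) ---
    # The file written above is a JSON list of dicts, one per
    # (hems_result, time_type, vehicle_type) combination, each shaped like the
    # return_dict built in the loop (parameter values below are invented):
    #
    #   {"vehicle_type": "helicopter", "time_type": "time_to_scene",
    #    "best_fit": {"gamma": {"a": 1.8, "loc": 0.0, "scale": 6.2}},
    #    "hems_result": "Patient Conveyed by HEMS", "n": 412}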
 346
 347    def age_distributions(self):
 348        """
 349
 350            Determine the 'best' distribution for age stratified by
 351            AMPDS card
 352
 353        """
 354
 355        age_distr = []
 356
 357        age_df = self.df[["age", "ampds_card"]].dropna()
 358        ampds_cards = age_df['ampds_card'].unique()
 359        print(ampds_cards)
 360
 361        for card in ampds_cards:
 362            fit_ages = age_df[age_df['ampds_card'] == card]['age']
 363            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
 364            return_dict = { "ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
 365            age_distr.append(return_dict)
 366
 367        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
 368            convert_file.write(json.dumps(age_distr))
 370
 371
 372    # def inter_arrival_times(self):
 373    #     """
 374
 375    #         Calculate the mean inter-arrival times for patients
 376    #         stratified by hour, and and yearly quarter
 377
 378    #     """
 379
 380    #     ia_df = self.df[['date_only', 'quarter', 'hour']].dropna()
 381
 382    #     count_df = ia_df.groupby(['hour', 'date_only', 'quarter']).size().reset_index(name='n')
 383
 384    #     ia_times_df = (
 385    #         count_df.groupby(['hour', 'quarter'])
 386    #         .agg(
 387    #             # max_arrivals_per_hour=('n', lambda x: round(60 / np.max(x), 3)),
 388    #             # min_arrivals_per_hour=('n', lambda x: round(60 / np.min(x),3)),
 389    #             mean_cases=('n', lambda x: round(x.mean(), 1)),
 390    #             # sd_cases=('n', lambda x: round(x.std(), 3)),
 391    #             mean_iat=('n', lambda x: 60 / x.mean())
 392    #             # n=('n', 'size')
 393    #         )
 394    #         .reset_index()
 395    #     )
 396    #     # Additional column for NSPPThinning
 397    #     ia_times_df['t'] = ia_times_df['hour']
 398    #     ia_times_df['arrival_rate'] = ia_times_df['mean_iat'].apply(lambda x: 1/x)
 399
 400    #     ia_times_df.to_csv('distribution_data/inter_arrival_times.csv', mode='w+')
 401
 402
 403    def incidents_per_day(self):
 404        """
 405        Fit distributions for number of incidents per day using actual daily counts,
 406        applying year-based weighting to reflect trends (e.g., 2024 busier than 2023),
 407        stratified by season and quarter.
 408        """
 412
 413        inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy()
 414        inc_df['year'] = inc_df['date_only'].dt.year
 415
 416        # Daily incident counts
 417        inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 418        inc_per_day['year'] = inc_per_day['date_only'].dt.year
 419
 420        # Merge quarter and season from self.df
 421        date_info = self.df[['date_only', 'quarter']].drop_duplicates()
 422
 423        if 'season' not in self.df.columns:
 424            date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 425        else:
 426            date_info = date_info.merge(
 427                self.df[['date_only', 'season']].drop_duplicates(),
 428                on='date_only',
 429                how='left'
 430            )
 431
 432        inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left')
 433
 434        # Weight settings - simple row-replication weighting rather than a weighted-mean approach
 435        year_weights = {
 436            2023: 1.0,
 437            2024: 4.0  # weight 2024 four times as heavily as 2023
 438        }
 439
 440        # ========== SEASONAL DISTRIBUTIONS ==========
 441        jpd_distr = []
 442
 443        for season in inc_per_day['season'].dropna().unique():
 444            filtered = inc_per_day[inc_per_day['season'] == season].copy()
 445            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 446
 447            # Repeat rows proportionally by weight
 448            replicated = filtered.loc[
 449                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 450            ]['jobs_per_day']
 451
 452            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 453
 454            jpd_distr.append({
 455                "season": season,
 456                "best_fit": best_fit,
 457                "min_n_per_day": int(replicated.min()),
 458                "max_n_per_day": int(replicated.max()),
 459                "mean_n_per_day": float(replicated.mean())
 460            })
 461
 462        with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f:
 463            json.dump(jpd_distr, f)
 464
 465        # ========== QUARTERLY DISTRIBUTIONS ==========
 466        jpd_qtr_distr = []
 467
 468        for quarter in inc_per_day['quarter'].dropna().unique():
 469            filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy()
 470            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
 471
 472            replicated = filtered.loc[
 473                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
 474            ]['jobs_per_day']
 475
 476            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
 477
 478            jpd_qtr_distr.append({
 479                "quarter": int(quarter),
 480                "best_fit": best_fit,
 481                "min_n_per_day": int(replicated.min()),
 482                "max_n_per_day": int(replicated.max()),
 483                "mean_n_per_day": float(replicated.mean())
 484            })
 485
 486        with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f:
 487            json.dump(jpd_qtr_distr, f)
 488
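    # --- Illustrative sketch (editorial addition) ---
    # The year weighting above works by replicating each day's row in proportion
    # to its weight before fitting: with year_weights = {2023: 1.0, 2024: 4.0}
    # and the x10 scale factor, a 2023 day contributes 10 copies of its count and
    # a 2024 day contributes 40, so 2024 dominates the fitted distribution.
    #
    #   filtered["weight"] = filtered["year"].map(year_weights).fillna(1.0)
    #   replicated = filtered.loc[
    #       filtered.index.repeat((filtered["weight"] * 10).round().astype(int))
    #   ]["jobs_per_day"]
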
 489    def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
 490        """
 491            Create weighted empirical samples of incidents per day by season and quarter.
 492        """
 493
 494        inc_df = self.df[['date_only', 'quarter']].copy()
 495        inc_df['year'] = inc_df['date_only'].dt.year
 496        inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 497
 498        # Get raw counts per day
 499        daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
 500        daily_counts['year'] = daily_counts['date_only'].dt.year
 501
 502        # Merge back in season/quarter info
 503        meta_info = self.df[['date_only', 'quarter']].drop_duplicates()
 504        if 'season' in self.df.columns:
 505            meta_info = meta_info.merge(
 506                self.df[['date_only', 'season']].drop_duplicates(),
 507                on='date_only', how='left'
 508            )
 509        else:
 510            meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
 511
 512        daily_counts = daily_counts.merge(meta_info, on='date_only', how='left')
 513
 514        # Year weight map
 515        if weight_map is None:
 516            weight_map = {2023: 1.0, 2024: 1.1}
 517
 518        # Compute weights
 519        daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0)
 520
 521        # Storage
 522        empirical_samples = {}
 523
 524        # Season-based
 525        for season in daily_counts['season'].dropna().unique():
 526            filtered = daily_counts[daily_counts['season'] == season].copy()
 527            repeated = filtered.loc[
 528                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 529            ]['jobs_per_day'].tolist()
 530
 531            empirical_samples[season] = repeated
 532
 533        # Quarter-based
 534        for quarter in daily_counts['quarter'].dropna().unique():
 535            filtered = daily_counts[daily_counts['quarter'] == quarter].copy()
 536            repeated = filtered.loc[
 537                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
 538            ]['jobs_per_day'].tolist()
 539
 540            empirical_samples[f"Q{int(quarter)}"] = repeated
 541
 542        with open("distribution_data/inc_per_day_samples.json", 'w') as f:
 543            json.dump(empirical_samples, f)
 544
 545
 546    def enhanced_or_critical_care_by_ampds_card_probs(self):
 547        """
 548
 549            Calculates the probability of an enhanced or critical care resource being required
 550            based on the AMPDS card
 551
 552        """
 553
 554        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()
 555
 556        def assign_care_category(row):
 557            # There are some columns with both EC and CC benefit selected
 558            # this function will allocate to only 1
 559            if row['cc_benefit'] == 'y':
 560                return 'CC'
 561            elif row['ec_benefit'] == 'y':
 562                return 'EC'
 563            else:
 564                return 'REG'
 565
 566        ec_df['care_category'] = ec_df.apply(assign_care_category, axis = 1)
 567
 568        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
 569        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')
 570
 571        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)
 572
 573        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode = "w+", index = False)
 574
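    # --- Illustrative note (editorial addition) ---
    # assign_care_category() gives CC precedence over EC, so a record flagged with
    # both benefits is counted once, as CC. Hypothetical rows:
    #
    #   {"cc_benefit": "y", "ec_benefit": "y"}   -> "CC"
    #   {"cc_benefit": None, "ec_benefit": "y"}  -> "EC"
    #   {"cc_benefit": None, "ec_benefit": None} -> "REG"
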
 575    def patient_outcome_by_care_category_and_quarter_probs(self):
 576        """
 577
 578            Calculates the probability of a patient outcome based on care category and yearly quarter
 579
 580        """
 581
 582        po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy()
 583
 584        def assign_care_category(row):
 585            # There are some columns with both EC and CC benefit selected
 586            # this function will allocate to only 1
 587            if row['cc_benefit'] == 'y':
 588                return 'CC'
 589            elif row['ec_benefit'] == 'y':
 590                return 'EC'
 591            else:
 592                return 'REG'
 593
 594        # There are some values that are missing e.g. CC quarter 1 Deceased
 595        # I think we've had problems when trying to sample from this kind of thing before
 596        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 597        outcomes = po_df['pt_outcome'].unique()
 598        care_categories = ['CC', 'EC', 'REG']
 599        quarters = po_df['quarter'].unique()
 600
 601        all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)),
 602                                    columns=['pt_outcome', 'care_category', 'quarter'])
 603
 604        po_df['care_category'] = po_df.apply(assign_care_category, axis = 1)
 605
 606        po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count')
 607
 608        merged = pd.merge(all_combinations, po_cat_counts,
 609                      on=['pt_outcome', 'care_category', 'quarter'],
 610                      how='left').fillna({'count': 0})
 611        merged['count'] = merged['count'].astype(int)
 612
 613        total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum')
 614        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 615
 616        merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode = "w+", index = False)
 617
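    # --- Illustrative sketch (editorial addition) ---
    # Building every (pt_outcome, care_category, quarter) combination up front and
    # left-joining the observed counts guarantees a row (count 0, proportion 0.0)
    # for combinations that never occur historically, which keeps downstream
    # sampling from failing on missing keys. A minimal, hypothetical example:
    #
    #   all_combos = pd.DataFrame(
    #       list(itertools.product(["Deceased"], ["CC"], [1])),
    #       columns=["pt_outcome", "care_category", "quarter"])
    #   # merge(all_combos, observed_counts, how="left").fillna({"count": 0})
    #   # then yields count == 0 for "CC" / quarter 1 / "Deceased" if unseen.
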
 618    # def hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 619    #     """
 620
 621    #         Calculates the probabilty of a given HEMS result based on
 622    #         patient outcome, yearly quarter, vehicle type and callsign group
 623
 624    #     """
 625
 626    #     hr_df = self.df[['hems_result', 'quarter', 'pt_outcome', 'vehicle_type', 'callsign_group']].copy()
 627
 628    #     # There are some values that are missing e.g. CC quarter 1 Deceased
 629    #     # I think we've had problems when trying to sample from this kind of thing before
 630    #     # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 631    #     # hems_results = hr_df['hems_result'].unique()
 632    #     # outcomes = hr_df['pt_outcome'].unique()
 633    #     # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 634    #     # callsign_group_categories = hr_df['callsign_group'].unique()
 635    #     # quarters = hr_df['quarter'].unique()
 636
 637    #     # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 638    #     #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 639
 640    #     hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter']).size().reset_index(name='count')
 641
 642    #     # merged = pd.merge(all_combinations, hr_cat_counts,
 643    #     #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 644    #     #               how='left').fillna({'count': 0})
 645    #     # merged['count'] = merged['count'].astype(int)
 646
 647    #     merged = hr_cat_counts
 648
 649    #     total_counts = merged.groupby(['pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])['count'].transform('sum')
 650    #     merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 651
 652    #     merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 653
 654    def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
 655        """
 656
 657            Calculates the probability of a given HEMS result based on
 658                - patient outcome
 659                - yearly quarter
 660                - time of day (7am - 6pm, 7pm - 6am)
 661                - vehicle type
 662                - and callsign group
 663
 664        """
 665        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
 666        self.df['hour'] = self.df['inc_date'].dt.hour
 667        self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night")
 668
 669        hr_df = self.df[[
 670            'hems_result', 'quarter', 'pt_outcome',
 671            'vehicle_type', 'callsign_group', 'time_of_day'
 672            ]].copy()
 673
 674        # There are some values that are missing e.g. CC quarter 1 Deceased
 675        # I think we've had problems when trying to sample from this kind of thing before
 676        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
 677        # hems_results = hr_df['hems_result'].unique()
 678        # outcomes = hr_df['pt_outcome'].unique()
 679        # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
 680        # callsign_group_categories = hr_df['callsign_group'].unique()
 681        # quarters = hr_df['quarter'].unique()
 682
 683        # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
 684        #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
 685
 686        hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome',
 687                                       'vehicle_type', 'callsign_group',
 688                                       'quarter', 'time_of_day']).size().reset_index(name='count')
 689
 690        # merged = pd.merge(all_combinations, hr_cat_counts,
 691        #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
 692        #               how='left').fillna({'count': 0})
 693        # merged['count'] = merged['count'].astype(int)
 694
 695        merged = hr_cat_counts
 696
 697        total_counts = merged.groupby(['pt_outcome',
 698                                       'vehicle_type', 'callsign_group',
 699                                       'quarter', 'time_of_day'])['count'].transform('sum')
 700        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
 701
 702        merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
 703
 704
 705
 706    def hourly_arrival_by_qtr_probs(self):
 707        """
 708
 709            Calculates the proportions of calls arriving in any given hour
 710            stratified by yearly quarter
 711
 712        """
 713
 714        ia_df = self.df[['quarter', 'hour']].dropna()
 715
 716        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
 717        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
 718        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)
 719
 720        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")
 721
 722
 723    def callsign_group_by_ampds_card_and_hour_probs(self):
 724        """
 725
 726            Calculates the probability of a specific callsign group being allocated to
 727            a call based on the AMPDS card category and hour of day
 728
 729        """
 730        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')
 731
 732        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
 733        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 734
 735        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode = "w+", index=False)
 736
 737    def callsign_group_by_ampds_card_probs(self):
 738        """
 739
 740            Calculates the probability of a specific callsign group being allocated to
 741            a call based on the AMPDS card category
 742
 743        """
 744
 745        callsign_df = self.df[self.df['callsign_group'] != 'Other']
 746
 747        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')
 748
 749        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
 750        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 751
 752        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode = "w+", index=False)
 753
 754    def callsign_group(self):
 755        """
 756            Calculates the probability of a specific callsign group being allocated to
 757            a call
 758        """
 759        df = self.df.copy()
 760
 761        # Convert time fields to numeric
 762        time_fields = [
 763            "time_allocation", "time_mobile", "time_to_scene",
 764            "time_on_scene", "time_to_hospital", "time_to_clear"
 765        ]
 766        for col in time_fields:
 767            df[col] = pd.to_numeric(df[col], errors="coerce")
 768
 769        # Calculate total job duration in minutes
 770        df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 771
 772        # Compute job start and end times
 773        df["start_time"] = df["inc_date"]
 774        df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 775
 776        # Sort jobs by start time
 777        df = df.sort_values(by="start_time").reset_index(drop=True)
 778
 779        # Set to hold indices of jobs that overlap (but only the later-starting ones)
 780        overlapping = set()
 781
 782        # Check for overlaps
 783        for i in range(len(df)):
 784            this_end = df.at[i, "end_time"]
 785
 786            # Compare only to later jobs
 787            for j in range(i + 1, len(df)):
 788                next_start = df.at[j, "start_time"]
 789                if next_start >= this_end:
 790                    break  # No more possible overlaps
 791                # If it starts before i's job ends, it's overlapping
 792                overlapping.add(j)
 793
 794        # Mark the overlaps in the dataframe
 795        df["overlaps"] = df.index.isin(overlapping)
 796
 797        # Filter out overlapping jobs
 798        df_no_overlap = df[~df["overlaps"]]
 799
 800        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 801        # the vehicles to be recorded as offline
 802
 803        data = df_no_overlap.copy()
 804
 805        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 806        # off for servicing if that data is available
 807        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 808        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 809
 810        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 811        data["vehicle"] = data["vehicle"].str.lower()
 812
 813        # Create a cross-join between data and ad_hoc
 814        data['key'] = 1
 815        ad_hoc['key'] = 1
 816        merged = data.merge(ad_hoc, on='key')
 817
 818        # Keep rows where inc_date falls within the offline period
 819        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 820
 821        # Filter out those rows from the original data
 822        df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 823
 824        callsign_df = (
 825            df_no_overlap
 826            .assign(
 827                helicopter_benefit=np.select(
 828                    [
 829                        df_no_overlap["cc_benefit"] == "y",
 830                        df_no_overlap["ec_benefit"] == "y",
 831                        df_no_overlap["hems_result"].isin([
 832                            "Stand Down En Route",
 833                            "Landed but no patient contact",
 834                            "Stand Down Before Mobile"
 835                        ])
 836                    ],
 837                    ["y", "y", "n"],
 838                    default=df_no_overlap["helicopter_benefit"]
 839                ),
 840                care_category=np.select(
 841                    [
 842                        df_no_overlap["cc_benefit"] == "y",
 843                        df_no_overlap["ec_benefit"] == "y"
 844                    ],
 845                    ["CC", "EC"],
 846                    default="REG"
 847                )
 848            )
 849        )
 850
 851        callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 852
 853        callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count')
 854
 855        total_counts = len(callsign_df)
 856        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 857
 858        callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False)
 859
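    # --- Illustrative note (editorial addition) ---
    # The overlap scan above sorts jobs by start time and, for each job, only walks
    # forward until it reaches a job that starts after the current job ends, flagging
    # the later-starting member of each overlapping pair. With made-up times:
    #
    #   A: 09:00-10:00, B: 09:30-09:50, C: 10:10-10:40
    #   -> B is flagged (it starts before A ends); C is not, and the inner loop
    #      breaks as soon as C's start time is >= A's end time.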
 860
 861    # def callsign_group_by_care_category(self):
 862    #     """
 863
 864    #         Calculates the probabilty of a specific callsign being allocated to
 865    #         a call based on the care category
 866
 867    #     """
 868    #     df = self.df.copy()
 869
 870    #     # Convert time fields to numeric
 871    #     time_fields = [
 872    #         "time_allocation", "time_mobile", "time_to_scene",
 873    #         "time_on_scene", "time_to_hospital", "time_to_clear"
 874    #     ]
 875    #     for col in time_fields:
 876    #         df[col] = pd.to_numeric(df[col], errors="coerce")
 877
 878    #     # Calculate total job duration in minutes
 879    #     df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
 880
 881    #     # Compute job start and end times
 882    #     df["start_time"] = df["inc_date"]
 883    #     df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
 884
 885    #     # Sort jobs by start time
 886    #     df = df.sort_values(by="start_time").reset_index(drop=True)
 887
 888    #     # Set to hold indices of jobs that overlap (but only the later-starting ones)
 889    #     overlapping = set()
 890
 891    #     # Check for overlaps
 892    #     for i in range(len(df)):
 893    #         this_end = df.at[i, "end_time"]
 894
 895    #         # Compare only to later jobs
 896    #         for j in range(i + 1, len(df)):
 897    #             next_start = df.at[j, "start_time"]
 898    #             if next_start >= this_end:
 899    #                 break  # No more possible overlaps
 900    #             # If it starts before i's job ends, it's overlapping
 901    #             overlapping.add(j)
 902
 903    #     # Mark the overlaps in the dataframe
 904    #     df["overlaps"] = df.index.isin(overlapping)
 905
 906    #     # Filter out overlapping jobs
 907    #     df_no_overlap = df[~df["overlaps"]]
 908
 909
 910    #     callsign_df = (
 911    #         df_no_overlap
 912    #         .assign(
 913    #             helicopter_benefit=np.select(
 914    #                 [
 915    #                     df_no_overlap["cc_benefit"] == "y",
 916    #                     df_no_overlap["ec_benefit"] == "y",
 917    #                     df_no_overlap["hems_result"].isin([
 918    #                         "Stand Down En Route",
 919    #                         "Landed but no patient contact",
 920    #                         "Stand Down Before Mobile"
 921    #                     ])
 922    #                 ],
 923    #                 ["y", "y", "n"],
 924    #                 default=df_no_overlap["helicopter_benefit"]
 925    #             ),
 926    #             care_category=np.select(
 927    #                 [
 928    #                     df_no_overlap["cc_benefit"] == "y",
 929    #                     df_no_overlap["ec_benefit"] == "y"
 930    #                 ],
 931    #                 ["CC", "EC"],
 932    #                 default="REG"
 933    #             )
 934    #         )
 935    #     )
 936
 937    #     callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
 938
 939    #     callsign_counts = callsign_df.groupby(['care_category', 'callsign_group']).size().reset_index(name='count')
 940
 941    #     total_counts = callsign_counts.groupby(['care_category'])['count'].transform('sum')
 942    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 943
 944    #     callsign_counts.to_csv('distribution_data/callsign_group_by_care_category_probs.csv', mode = "w+", index=False)
 945
 946    #========== ARCHIVED CODE ============ #
 947    # def vehicle_type_by_month_probs(self):
 948    #     """
 949
 950    #         Calculates the probabilty of a car/helicopter being allocated to
 951    #         a call based on the callsign group and month of the year
 952
 953    #     """
 954    #     callsign_counts = self.df.groupby(['callsign_group', 'month', 'vehicle_type']).size().reset_index(name='count')
 955
 956    #     total_counts = callsign_counts.groupby(['callsign_group', 'month'])['count'].transform('sum')
 957    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 958
 959    #     callsign_counts.to_csv('distribution_data/vehicle_type_by_month_probs.csv', mode = "w+")
 960    #========== END ARCHIVED CODE ============ #
 961    def vehicle_type_by_quarter_probs(self):
 962        """
 963
 964            Calculates the probability of a car/helicopter being allocated to
 965            a call based on the callsign group and quarter of the year
 966
 967            Quarter accounts for seasonal variation without being as affected by
 968            month-to-month noise and small sample sizes as a monthly stratification
 969        """
 970        data = self.df.copy()
 971
 972        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 973        # the vehicles to be recorded as offline
 974
 975        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 976        # off for servicing if that data is available
 977        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 978        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 979
 980        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 981        data["vehicle"] = data["vehicle"].str.lower()
 982
 983        # Create a cross-join between data and ad_hoc
 984        data['key'] = 1
 985        ad_hoc['key'] = 1
 986        merged = data.merge(ad_hoc, on='key')
 987
 988        # Keep rows where inc_date falls within the offline period
 989        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 990
 991        # Filter out those rows from the original data
 992        filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 993
 994        # First, calculate overall props
 995        callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
 996
 997        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
 998        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 999
1000        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1001
1002        # Then, redo by quarter
1003        callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count')
1004
1005        total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum')
1006        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1007
1008        callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+")
1009
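    # --- Illustrative note (editorial addition) ---
    # The ad-hoc filtering above uses a cross join (the constant 'key' column) so
    # that every incident is compared against every offline window; an incident
    # whose inc_date falls inside any [offline, online] interval is dropped. For
    # example, an incident at 2023-06-01 10:00 matched against a window of
    # 2023-06-01 08:00 to 12:00 ends up in `overlap` and is excluded from
    # `filtered_data`. (On pandas >= 1.2 the same cross join could be written
    # as merge(how="cross").)
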
1010    def vehicle_type_probs(self):
1011        """
1012
1013            Calculates the probability of a car/helicopter being allocated to
1014            a call based on the callsign group
1015
1016        """
1017
1018        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
1019
1020        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
1021        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1022
1023        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1024
1025    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1026        """
1027
1028            Calculates the probability of a specific HEMS result being allocated to
1029            a call based on the callsign group and vehicle type
1030
1031            TODO: These probability calculation functions could probably be refactored into a single
1032            function and just specify columns and output name
1033
1034        """
1035        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')
1036
1037        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
1038        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)
1039
1040        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode = "w+", index=False)
1041
1042
1043    #========== ARCHIVED CODE ============ #
1044    # def hems_result_by_care_cat_and_helicopter_benefit_probs(self):
1045    #     """
1046
1047    #         Calculates the probabilty of a specific HEMS result being allocated to
1048    #         a call based on the care category amd whether a helicopter is beneficial
1049
1050    #     """
1051
1052    #     # Wrangle the data...trying numpy for a change
1053
1054    #     hems_df = (
1055    #         self.df
1056    #         .assign(
1057    #             helicopter_benefit=np.select(
1058    #                 [
1059    #                     self.df["cc_benefit"] == "y",
1060    #                     self.df["ec_benefit"] == "y",
1061    #                     self.df["hems_result"].isin([
1062    #                         "Stand Down En Route",
1063    #                         "Landed but no patient contact",
1064    #                         "Stand Down Before Mobile"
1065    #                     ])
1066    #                 ],
1067    #                 ["y", "y", "n"],
1068    #                 default=self.df["helicopter_benefit"]
1069    #             ),
1070    #             care_cat=np.select(
1071    #                 [
1072    #                     self.df["cc_benefit"] == "y",
1073    #                     self.df["ec_benefit"] == "y"
1074    #                 ],
1075    #                 ["CC", "EC"],
1076    #                 default="REG"
1077    #             )
1078    #         )
1079    #     )
1080
1081    #     hems_counts = hems_df.groupby(['hems_result', 'care_cat', 'helicopter_benefit']).size().reset_index(name='count')
1082
1083    #     hems_counts['total'] = hems_counts.groupby(['care_cat', 'helicopter_benefit'])['count'].transform('sum')
1084    #     hems_counts['proportion'] = round(hems_counts['count'] / hems_counts['total'], 4)
1085
1086    #     hems_counts.to_csv('distribution_data/hems_result_by_care_cat_and_helicopter_benefit_probs.csv', mode = "w+", index=False)
1087    #========== END ARCHIVED CODE ============ #
1088
1089    #========== ARCHIVED CODE ============ #
1090    # def pt_outcome_by_hems_result_and_care_category_probs(self):
1091    #     """
1092
1093    #         Calculates the probabilty of a specific patient outcome based on HEMS result
1094
1095    #     """
1096
1097    #     hems_df = (
1098    #         self.df
1099    #         .assign(
1100    #             helicopter_benefit=np.select(
1101    #                 [
1102    #                     self.df["cc_benefit"] == "y",
1103    #                     self.df["ec_benefit"] == "y",
1104    #                     self.df["hems_result"].isin([
1105    #                         "Stand Down En Route",
1106    #                         "Landed but no patient contact",
1107    #                         "Stand Down Before Mobile"
1108    #                     ])
1109    #                 ],
1110    #                 ["y", "y", "n"],
1111    #                 default=self.df["helicopter_benefit"]
1112    #             ),
1113    #             care_category=np.select(
1114    #                 [
1115    #                     self.df["cc_benefit"] == "y",
1116    #                     self.df["ec_benefit"] == "y"
1117    #                 ],
1118    #                 ["CC", "EC"],
1119    #                 default="REG"
1120    #             )
1121    #         )
1122    #     )
1123
1124    #     po_counts = hems_df.groupby(['pt_outcome', 'hems_result', 'care_category']).size().reset_index(name='count')
1125
1126    #     po_counts['total'] = po_counts.groupby(['hems_result', 'care_category'])['count'].transform('sum')
1127    #     po_counts['proportion'] = round(po_counts['count'] / po_counts['total'], 4)
1128
1129    #     po_counts.to_csv('distribution_data/pt_outcome_by_hems_result_and_care_category_probs.csv', mode = "w+")
1130    #========== END ARCHIVED CODE ============ #
1131
1132    def school_holidays(self) -> None:
1133        """
1134            Function to generate a CSV file containing school holiday
1135            start and end dates for a given year. The year range is determined
1136            by the submitted data (plus self.school_holidays_years additional years for good measure)
1137        """
1138
1139        min_date = self.df.inc_date.min()
1140        max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years))
1141
1142        u = Utils()
1143
1144        years_of_holidays_list = u.years_between(min_date, max_date)
1145
1146        sh = pd.DataFrame(columns=['year', 'start_date', 'end_date'])
1147
1148        for i, year in enumerate(years_of_holidays_list):
1149            tmp = u.calculate_term_holidays(year)
1150
1151            if i == 0:
1152                sh = tmp
1153            else:
1154                sh = pd.concat([sh, tmp])
1155
1156        sh.to_csv('actual_data/school_holidays.csv', index = False)
1157
1158
1159# The functions below wrangle historical data to provide a comparison against the simulation outputs
1160
1161    def historical_jobs_per_day_per_callsign(self):
1162        df = self.df
1163
1164        df["date"] = pd.to_datetime(df["inc_date"]).dt.date
1165        all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index()
1166        all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True)
1167
1168        all_combinations = pd.DataFrame(
1169            list(itertools.product(df['date'].unique(), df['callsign'].unique())),
1170            columns=['date', 'callsign']
1171        ).dropna()
1172
1173        merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left')
1174        merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int)
1175
1176        all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"})
1177        all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False)
1178
1179    def historical_care_cat_counts(self):
1180        """
1181        Process historical incident data to categorize care types and compute hourly counts.
1182
1183        This method performs the following steps:
1184        - Converts incident dates to datetime format.
1185        - Extracts month start and hour from the incident date.
1186        - Categorizes each incident into care categories based on benefit flags and attendance.
1187        - Counts the number of incidents by hour and care category.
1188        - Outputs these counts to a CSV file.
1189        - Computes and writes the proportion of regular care jobs with a helicopter benefit
1190        (excluding those not attended by a DAA resource) to a text file.
1191
1192        Outputs:
1193        - CSV file: 'historical_data/historical_care_cat_counts.csv'
1194        - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
1195        """
1196
1197        df_historical = self.df
1198
1199        df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date'])
1200        # Extract the first day of the month and the hour of each incident
1201        df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01")
1202        df_historical['hour'] = df_historical.inc_date.dt.hour
1203
1204        conditions = [
1205            df_historical['cc_benefit'] == 'y',
1206            df_historical['ec_benefit'] == 'y',
1207            df_historical['helicopter_benefit'] == 'y',
1208            df_historical['callsign_group'] == 'Other'
1209        ]
1210
1211        choices = [
1212            'CC',
1213            'EC',
1214            'REG - helicopter benefit',
1215            'Unknown - DAA resource did not attend'
1216        ]
1217        # Assign care category to each record
1218        # If the case did not meet any of the criteria in 'conditions', it will default
1219        # to being labelled as a 'regular/REG' case (i.e. there was no benefit recorded)
1220        df_historical['care_category'] = np.select(conditions, choices, default='REG')
1221
1222        # Count occurrences grouped by hour and care category
1223        historical_value_counts_by_hour = (
1224            df_historical.value_counts(["hour", "care_category"])
1225            .reset_index(name="count")
1226            )
1227        # Output to CSV for use in tests and visualisations
1228        (historical_value_counts_by_hour
1229         .sort_values(['hour', 'care_category'])
1230         .to_csv("historical_data/historical_care_cat_counts.csv"))
1231
1232        # Also output the % of regular (not cc/ec) jobs with a helicopter benefit
1233        # These are the regular jobs we assume follow different logic because there is an obvious expected
1234        # patient benefit to having a helicopter allocated, which we have to assume is apparent at the time
1235        # the call is placed (such as the casualty being located in a remote location)
1236
1237        numerator = (historical_value_counts_by_hour[
1238            historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit"
1239            ]["count"].sum())
1240
1241        denominator = (historical_value_counts_by_hour[
1242            (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") |
1243            (historical_value_counts_by_hour["care_category"] == "REG")
1244            ]["count"].sum())
1245
1246        with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file:
1247            heli_benefit_file.write(json.dumps((numerator/denominator).round(4)))
1248
1249        # Count occurrences grouped by hour, care category and helicopter benefit
1250        historical_value_counts_by_hour_cc_ec = (
1251            df_historical.value_counts(["hour", "care_category", "helicopter_benefit"])
1252            .reset_index(name="count")
1253            )
1254
1255        # Output to CSV for use in tests and visualisations (currently disabled)
1256        # (historical_value_counts_by_hour_cc_ec
1257        #  .sort_values(["hour", "care_category", "helicopter_benefit"])
1258        #  .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv"))
1259
1260
1261        numerator_cc = (
1262            historical_value_counts_by_hour_cc_ec[
1263                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") &
1264                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1265                ]["count"].sum())
1266
1267        denominator_cc = (
1268            historical_value_counts_by_hour_cc_ec[
1269                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC")
1270                ]["count"].sum())
1271
1272        with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file:
1273            heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4)))
1274
1275        numerator_ec = (
1276            historical_value_counts_by_hour_cc_ec[
1277                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") &
1278                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1279                ]["count"].sum())
1280
1281        denominator_ec = (
1282            historical_value_counts_by_hour_cc_ec[
1283                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC")
1284                ]["count"].sum())
1285
1286        with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file:
1287            heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))
1288
1289
1290
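        # Illustrative sketch (toy data, not part of the pipeline) of the precedence relied on in
        # historical_care_cat_counts above: np.select applies the first matching condition, so a job
        # flagged for both cc_benefit and helicopter_benefit is categorised as 'CC', e.g.
        # (the callsign groups below are placeholders):
        #   toy = pd.DataFrame({"cc_benefit": ["y", None, None],
        #                       "ec_benefit": [None, "y", None],
        #                       "helicopter_benefit": ["y", None, "y"],
        #                       "callsign_group": ["H70", "CC70", "CC71"]})
        #   np.select([toy.cc_benefit == "y", toy.ec_benefit == "y",
        #              toy.helicopter_benefit == "y", toy.callsign_group == "Other"],
        #             ['CC', 'EC', 'REG - helicopter benefit', 'Unknown - DAA resource did not attend'],
        #             default='REG')
        #   # -> ['CC', 'EC', 'REG - helicopter benefit']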
1291    def historical_monthly_totals(self):
1292        """
1293            Calculates monthly incident totals from provided dataset of historical data
1294        """
1295
1296        # Multiple resources can be sent to the same job.
1297        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
1298            .drop_duplicates(subset="inc_date", keep="first")
1299
1300        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down")
1301        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
1302        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)
1303
1304        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
1305                                stand_down_car=('stand_down_car', 'sum'),
1306                                stand_down_helicopter=('stand_down_helicopter', 'sum'),
1307                                total_jobs=('vehicle_type', 'size')
1308                            ).reset_index()
1309
1310        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)
1311
1312    def historical_monthly_totals_all_calls(self):
1313        """
1314            Calculates monthly totals of all calls from the provided dataset of historical data
1315        """
1316
1317        # Multiple resources can be sent to the same job.
1318        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()
1319
1320        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()
1321
1322        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)
1323
1324    def historical_monthly_totals_by_callsign(self):
1325        """
1326            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1327        """
1328
1329        # Multiple resources can be sent to the same job.
1330        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()
1331
1332        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()
1333
1334        #print(monthly_totals_df.head())
1335
1336        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1337
1338        #print(monthly_totals_pivot_df.head())
1339
1340        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)
1341
1342    def historical_monthly_totals_by_hour_of_day(self):
1343        """
1344            Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
1345        """
1346
1347        # Multiple resources can be sent to the same job.
1348        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
1349            .drop_duplicates(subset="inc_date", keep="first")
1350
1351        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()
1352
1353        #print(monthly_totals_df.head())
1354
1355        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1356
1357        #print(monthly_totals_pivot_df.head())
1358
1359        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)
1360
1361    def historical_monthly_totals_by_day_of_week(self):
1362        """
1363            Calculates number of incidents per month stratified by day of the week
1364        """
1365
1366        # Multiple resources can be sent to the same job.
1367        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
1368            .drop_duplicates(subset="inc_date", keep="first")
1369
1370        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()
1371
1372        #print(monthly_totals_df.head())
1373
1374        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1375
1376        #print(monthly_totals_pivot_df.head())
1377
1378        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)
1379
1380    def historical_median_time_of_activities_by_month_and_resource_type(self):
1381        """
1382            Calculate the median time for each of the job cycle phases stratified by month and vehicle type
1383        """
1384
1385        median_df = self.df[['first_day_of_month', 'time_allocation',
1386                             'time_mobile', 'time_to_scene', 'time_on_scene',
1387                             'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1388
1389        median_df['total_job_time'] = median_df[[
1390            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1391            'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True)
1392
1393        # Optionally replace zeros with NaN to exclude them from the median calculation, since if a HEMS
1394        # result is Stood Down en route, time_on_scene would be zero and skew the median (currently disabled)
1395        # median_df.replace(0, np.nan, inplace=True)
1396
1397        # Grouping by month and resource_type, calculating medians
1398        median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index()
1399
1400        pivot_data = median_times.pivot_table(
1401            index='first_day_of_month',
1402            columns='vehicle_type',
1403            values=['time_allocation', 'time_mobile', 'time_to_scene',
1404                    'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time']
1405        )
1406
1407        pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns]
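            # e.g. the MultiIndex column ('time_mobile', 'car') becomes 'median_car_time_mobile' and
            # ('total_job_time', 'helicopter') becomes 'median_helicopter_total_job_time'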
1408        pivot_data = pivot_data.reset_index()
1409
1410        pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1411
1412    def historical_monthly_resource_utilisation(self):
1413        """
1414            Calculates number of, and time spent on, incidents per month stratified by callsign
1415        """
1416
1417        # Multiple resources can be sent to the same job.
1418        monthly_df = self.df[[
1419            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
1420            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
1421            'time_to_clear']].copy()
1422
1423        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)
1424
1425        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
1426            .agg(n = ('callsign', 'size'), total_time = ('total_time', 'sum'))
1427
1428        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time'])
1429
1430        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in  monthly_totals_pivot_df.columns]
1431        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()
1432
1433        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)
1434
1435    def historical_daily_calls_breakdown(self):
1436
1437        df = self.df
1438        # Convert inc_date to date only (remove time)
1439        df['date'] = pd.to_datetime(df['inc_date']).dt.date
1440
1441        # Count number of calls per day
1442        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')
1443
1444        # Save the daily call counts with a 'day' index column
1445        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
1446        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
1447        calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)
1448
1449        # Count how many days had the same number of calls
1450        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
1451        calls_per_day_summary.columns = ['calls_in_day', 'days']
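            # e.g. a row of (12, 31) would mean that 31 separate days in the historical data each had exactly 12 calls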
1452        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)
1453
1454    def historical_missed_jobs(self):
1455        df = self.df
1456        df["date"] = pd.to_datetime(df["inc_date"])
1457        df["hour"] = df["date"].dt.hour
1458        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
1459        df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x=="Other" else "HEMS (helo or car) available and sent")
1460        df["quarter"] = df["inc_date"].dt.quarter
1461
1462        # By month
1463        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','month_start'])
1464        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)
1465
1466        # By hour
1467        count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','hour'])
1468        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)
1469
1470        # By quarter and hour
1471        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter','callsign_group_simplified','hour'])
1472        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
1473
1474    def upper_allowable_time_bounds(self):
1475        """
1476            Calculates the maximum permissible time for each phase of an incident based on supplied historical data.
1477            This is intended to be 1.5x the upper quartile of the data distribution (the function currently only prints the upper quartile)
1478        """
1479
1480        median_df = self.df[[
1481            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1482            'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1483
1484        # Replacing zeros with NaN to exclude them from the quantile calculation
1485        # since if a HEMS result is Stood Down en route, then time_on_scene
1486        # would be zero and affect the upper quartile
1487        median_df.replace(0, np.nan, inplace=True)
1488
1489        print(median_df.quantile(.75))
1490        # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
1491
1492    def historical_job_durations_breakdown(self):
1493
1494        df = self.df
1495
1496        cols = [
1497            'callsign', 'vehicle_type',
1498            'time_allocation', 'time_mobile',
1499            'time_to_scene', 'time_on_scene',
1500            'time_to_hospital', 'time_to_clear'
1501        ]
1502        df2 = df[cols].copy()
1503
1504        # 2. Add a 1-based row identifier
1505        df2['job_identifier'] = range(1, len(df2) + 1)
1506
1507        # 3. Compute total_duration as the row-wise sum of the time columns
1508        time_cols = [
1509            'time_allocation', 'time_mobile',
1510            'time_to_scene', 'time_on_scene',
1511            'time_to_hospital', 'time_to_clear'
1512        ]
1513        df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True)
1514
1515        #print(df2.head())
1516
1517        # 4. Pivot (melt) to long format
1518        df_long = df2.melt(
1519            id_vars=['job_identifier', 'callsign', 'vehicle_type'],
1520            value_vars=time_cols + ['total_duration'],
1521            var_name='name',
1522            value_name='value'
1523        )
1524
1525        #print(df_long[df_long.job_identifier == 1])
1526
1527        # 5. Drop any rows where callsign or vehicle_type is missing
1528        df_long = df_long.dropna(subset=['callsign', 'vehicle_type'])
1529        df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True)
1530
1531        # 6. Write out to CSV
1532        df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False)
1533
1534    # ========== ARCHIVED CODE - v1 of calculate_availability_row ======================== #
1535    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df, period_start, period_end):
1536    #     """
1537    #     Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1538    #     Returns data tagged with bin, quarter, and downtime reason.
1539    #     """
1540
1541    #     registration = row['aircraft'].lower()
1542    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1543    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1544    #     reason = row.get('reason', None)
1545
1546    #     hour = downtime_start.hour
1547    #     if 0 <= hour <= 5:
1548    #         six_hour_bin = '00-05'
1549    #     elif 6 <= hour <= 11:
1550    #         six_hour_bin = '06-11'
1551    #     elif 12 <= hour <= 17:
1552    #         six_hour_bin = '12-17'
1553    #     else:
1554    #         six_hour_bin = '18-23'
1555
1556    #     quarter = downtime_start.quarter
1557
1558    #     # Match callsign
1559    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1560    #     if match.empty:
1561    #         return {
1562    #             'registration': registration,
1563    #             'offline': downtime_start,
1564    #             'online': downtime_end,
1565    #             'six_hour_bin': six_hour_bin,
1566    #             'quarter': quarter,
1567    #             'total_offline': None,
1568    #             'scheduled_minutes': None,
1569    #             'reason': reason,
1570    #             'proportion': None
1571    #         }
1572    #     callsign = match.iloc[0]['callsign']
1573
1574    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1575    #     if rota_rows.empty:
1576    #         return {
1577    #             'registration': registration,
1578    #             'offline': downtime_start,
1579    #             'online': downtime_end,
1580    #             'six_hour_bin': six_hour_bin,
1581    #             'quarter': quarter,
1582    #             'total_offline': None,
1583    #             'scheduled_minutes': None,
1584    #             'reason': reason,
1585    #             'proportion': None
1586    #         }
1587
1588    #     # Clip evaluation window to downtime period
1589    #     eval_start = max(downtime_start.normalize(), pd.to_datetime(period_start, utc=True))
1590    #     eval_end = min(downtime_end.normalize(), pd.to_datetime(period_end, utc=True))
1591
1592    #     total_scheduled_minutes = 0
1593    #     total_overlap_minutes = 0
1594
1595    #     current_day = eval_start
1596    #     while current_day <= eval_end:
1597    #         month = current_day.month
1598    #         season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1599
1600    #         for _, rota in rota_rows.iterrows():
1601    #             start_hour = rota[f'{season}_start']
1602    #             end_hour = rota[f'{season}_end']
1603
1604    #             rota_start = current_day + timedelta(hours=start_hour)
1605    #             rota_end = current_day + timedelta(hours=end_hour)
1606    #             if end_hour <= start_hour:
1607    #                 rota_end += timedelta(days=1)
1608
1609    #             # Count scheduled time regardless of overlap
1610    #             scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1611    #             total_scheduled_minutes += scheduled_minutes
1612
1613    #             # Count overlap only if intersecting with downtime
1614    #             overlap_start = max(downtime_start, rota_start)
1615    #             overlap_end = min(downtime_end, rota_end)
1616    #             if overlap_end > overlap_start:
1617    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1618    #                 total_overlap_minutes += overlap_minutes
1619
1620    #         current_day += timedelta(days=1)
1621
1622    #     if total_scheduled_minutes == 0:
1623    #         proportion = None
1624    #     else:
1625    #         proportion = total_overlap_minutes / total_scheduled_minutes
1626
1627    #     return {
1628    #         'registration': registration,
1629    #         'offline': downtime_start,
1630    #         'online': downtime_end,
1631    #         'six_hour_bin': six_hour_bin,
1632    #         'quarter': quarter,
1633    #         'total_offline': total_overlap_minutes,
1634    #         'scheduled_minutes': total_scheduled_minutes,
1635    #         'reason': reason,
1636    #         'proportion': proportion
1637    #     }
1638
1639
1640    # ============= ARCHIVED CODE - v2 of calculate_availability_row ======================= #
1641    # def calculate_availability_row(self, row, rota_df, callsign_lookup_df):
1642    #     """
1643    #         Compute downtime overlap, rota-based scheduled time, and proportion for a given row.
1644    #         Returns data tagged with bin, quarter, and downtime reason.
1645    #     """
1646
1647    #     registration = row['aircraft'].lower()
1648    #     downtime_start = pd.to_datetime(row['offline'], utc=True)
1649    #     downtime_end = pd.to_datetime(row['online'], utc=True)
1650    #     reason = row.get('reason', None)
1651
1652    #     hour = downtime_start.hour
1653    #     if 0 <= hour <= 5:
1654    #         six_hour_bin = '00-05'
1655    #     elif 6 <= hour <= 11:
1656    #         six_hour_bin = '06-11'
1657    #     elif 12 <= hour <= 17:
1658    #         six_hour_bin = '12-17'
1659    #     else:
1660    #         six_hour_bin = '18-23'
1661
1662    #     quarter = downtime_start.quarter
1663
1664    #     # Match callsign
1665    #     match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
1666    #     if match.empty:
1667    #         return {
1668    #             'registration': registration,
1669    #             'offline': downtime_start,
1670    #             'online': downtime_end,
1671    #             'six_hour_bin': six_hour_bin,
1672    #             'quarter': quarter,
1673    #             'total_offline': None,
1674    #             'scheduled_minutes': None,
1675    #             'reason': reason,
1676    #             'proportion': None
1677    #         }
1678    #     callsign = match.iloc[0]['callsign']
1679
1680    #     rota_rows = rota_df[rota_df['callsign'] == callsign]
1681    #     if rota_rows.empty:
1682    #         return {
1683    #             'registration': registration,
1684    #             'offline': downtime_start,
1685    #             'online': downtime_end,
1686    #             'six_hour_bin': six_hour_bin,
1687    #             'quarter': quarter,
1688    #             'total_offline': None,
1689    #             'scheduled_minutes': None,
1690    #             'reason': reason,
1691    #             'proportion': None
1692    #         }
1693
1694    #     month = downtime_start.month
1695    #     season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter'
1696
1697    #     total_scheduled_minutes = 0
1698    #     total_overlap_minutes = 0
1699
1700    #     for _, rota in rota_rows.iterrows():
1701    #         start_hour = rota[f'{season}_start']
1702    #         end_hour = rota[f'{season}_end']
1703
1704    #         for base_day in [downtime_start.normalize() - timedelta(days=1),
1705    #                         downtime_start.normalize(),
1706    #                         downtime_start.normalize() + timedelta(days=1)]:
1707
1708    #             rota_start = base_day + timedelta(hours=start_hour)
1709    #             rota_end = base_day + timedelta(hours=end_hour)
1710    #             if end_hour <= start_hour:
1711    #                 rota_end += timedelta(days=1)
1712
1713    #             overlap_start = max(downtime_start, rota_start)
1714    #             overlap_end = min(downtime_end, rota_end)
1715
1716    #             if overlap_end > overlap_start:
1717    #                 scheduled_minutes = (rota_end - rota_start).total_seconds() / 60
1718    #                 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
1719
1720    #                 total_scheduled_minutes += scheduled_minutes
1721    #                 total_overlap_minutes += overlap_minutes
1722
1723    #     if total_scheduled_minutes == 0:
1724    #         proportion = None
1725    #     else:
1726    #         proportion = total_overlap_minutes / total_scheduled_minutes
1727
1728    #     return {
1729    #         'registration': registration,
1730    #         'offline': downtime_start,
1731    #         'online': downtime_end,
1732    #         'six_hour_bin': six_hour_bin,
1733    #         'quarter': quarter,
1734    #         'total_offline': total_overlap_minutes,
1735    #         'scheduled_minutes': total_scheduled_minutes,
1736    #         'reason': reason,
1737    #         'proportion': proportion
1738    #     }
1739
1740    # ================ ARCHIVED CODE - ad-hoc unavailability calculation ================ #
1741    # def ad_hoc_unavailability(self, period_start, period_end, include_debugging_cols=False):
1742    #     """Process ad hoc unavailability records into a stratified probability table.
1743
1744    #     Calculates the probability of ad hoc unavailability and availability based
1745    #     on historical data. The data is stratified by aircraft registration,
1746    #     six-hour time bins, and calendar quarters. It ensures all standard
1747    #     reasons ('available', 'crew', 'weather', 'aircraft') are present for
1748    #     each combination. It filters out any registration/quarter/bin pairings
1749    #     with little scheduled time, marking their probabilities as blank. It also adds
1750    #     a count of the ad-hoc unavailability events considered for each probability calculation
1751    #     and the total scheduled time for the resource in that quarter/time period that was part of
1752    #     the calculations.
1753
1754    #     Returns
1755    #     -------
1756    #     None
1757    #         This function does not return a value but saves the calculated
1758    #         probabilities to 'distribution_data/ad_hoc_unavailability.csv'.
1759    #         The CSV file includes columns for registration, six_hour_bin,
1760    #         quarter, reason, probability, and count.
1761
1762    #     Notes
1763    #     -----
1764    #     This function relies on the following input files:
1765    #     - 'external_data/ad_hoc.csv': Contains ad hoc unavailability records.
1766    #     - 'actual_data/HEMS_ROTA.csv': Contains rota information.
1767    #     - 'actual_data/callsign_registration_lookup.csv': Maps callsigns to registrations.
1768    #     It also depends on the 'calculate_availability_row' method within the
1769    #     same class. Ensure that the 'distribution_data' directory exists.
1770    #     """
1771    #     try:
1772    #         # Load data
1773    #         adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1774    #         adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1775    #         rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1776    #         callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1777
1778    #         # Process each ad hoc record
1779    #         results = adhoc_df.apply(
1780    #             lambda row: self.calculate_availability_row(row, rota_df, callsign_lookup_df, period_start, period_end),
1781    #             axis=1
1782    #         )
1783    #         final_df = pd.DataFrame(results.tolist())
1784    #         final_df.to_csv("external_data/ad_hoc_intermediate.csv")
1785
1786    #         # Check if final_df is empty before proceeding
1787    #         if final_df.empty:
1788    #             print("No ad-hoc data processed, skipping file generation.")
1789    #             return
1790
1791    #         # Define the full set of reasons expected
1792    #         all_reasons = ['available', 'crew', 'weather', 'aircraft']
1793
1794    #         # --- Aggregate Data ---
1795    #         # Calculate job count per registration, quarter, AND bin
1796    #         unavailability_instance_counts = final_df.groupby(['registration', 'quarter', 'six_hour_bin']).size().reset_index(name='unavailability_instance_counts')
1797
1798    #         # Downtime by bin + quarter + reason (only for unavailability reasons)
1799    #         grouped = final_df[final_df['reason'].isin(['crew', 'weather', 'aircraft'])]
1800    #         grouped = grouped.groupby(['registration', 'six_hour_bin', 'quarter', 'reason'])['total_offline'].sum().reset_index()
1801
1802    #         # Scheduled time by bin + quarter
1803    #         scheduled_totals = final_df.groupby(['registration', 'six_hour_bin', 'quarter'])['scheduled_minutes'].sum().reset_index()
1804    #         scheduled_totals = scheduled_totals.rename(columns={'scheduled_minutes': 'total_scheduled'})
1805
1806    #         # Merge job counts into scheduled totals
1807    #         scheduled_totals = pd.merge(scheduled_totals, unavailability_instance_counts, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1808
1809    #         # Calculate total downtime per bin + quarter (for 'available' calculation)
1810    #         downtime_totals = grouped.groupby(['registration','six_hour_bin', 'quarter'])['total_offline'].sum().reset_index()
1811    #         downtime_totals = downtime_totals.rename(columns={'total_offline': 'total_downtime'})
1812
1813    #         # --- Create Full Grid ---
1814    #         # Get all unique combinations of registration, quarter, and bin
1815    #         unique_bins = scheduled_totals[['registration', 'quarter', 'six_hour_bin']].drop_duplicates()
1816
1817    #         # Check for empty unique_bins
1818    #         if unique_bins.empty:
1819    #             print("No valid unique bins found, skipping file generation.")
1820    #             return
1821
1822    #         # Create the full grid by crossing unique bins with all reasons
1823    #         full_grid = unique_bins.assign(key=1).merge(pd.DataFrame({'reason': all_reasons, 'key': 1}), on='key').drop('key', axis=1)
1824
1825    #         # --- Merge Data into Full Grid ---
1826    #         full_grid = pd.merge(full_grid, scheduled_totals, on=['registration', 'quarter', 'six_hour_bin'], how='left')
1827    #         full_grid = pd.merge(full_grid, grouped, on=['registration', 'six_hour_bin', 'quarter', 'reason'], how='left')
1828    #         full_grid = pd.merge(full_grid, downtime_totals, on=['registration', 'six_hour_bin', 'quarter'], how='left')
1829
1830    #         # Fill NaNs created during merges
1831    #         full_grid['total_offline'] = full_grid['total_offline'].fillna(0)
1832    #         full_grid['total_downtime'] = full_grid['total_downtime'].fillna(0)
1833    #         full_grid['unavailability_instance_counts'] = full_grid['unavailability_instance_counts'].fillna(0) # Fill job count with 0 for bins that might exist but have no jobs
1834
1835    #         # --- Calculate Probabilities ---
1836    #         # Suppress division by zero warnings - we handle these next
1837    #         with np.errstate(divide='ignore', invalid='ignore'):
1838    #             prob_avail = (full_grid['total_scheduled'] - full_grid['total_downtime']) / full_grid['total_scheduled']
1839    #             prob_unavail = full_grid['total_offline'] / full_grid['total_scheduled']
1840
1841    #             full_grid['probability'] = np.where(
1842    #                 full_grid['reason'] == 'available',
1843    #                 prob_avail,
1844    #                 prob_unavail
1845    #             )
1846
1847    #         # Handle NaN/Inf from division by zero, set them to 0.0 for now.
1848    #         full_grid['probability'] = full_grid['probability'].replace([np.inf, -np.inf], np.nan).fillna(0.0)
1849
1850    #         # --- Apply Threshold and Blanking ---
1851    #         # Condition for setting probability to blank
1852    #         condition_for_blank = (full_grid['total_scheduled'] < 60 * 2 * 30 * 3 ) # If less than 2 hours per day in time period rotad, exclude
1853
1854    #         # Convert probability to object to allow blanks, then apply the condition
1855    #         full_grid['probability'] = full_grid['probability'].astype(object)
1856    #         full_grid.loc[condition_for_blank, 'probability'] = ''
1857
1858    #         # --- Finalize and Save ---
1859    #         # Select and rename columns
1860    #         if include_debugging_cols:
1861    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability', 'unavailability_instance_counts', 'total_offline', 'total_scheduled']]
1862    #             final_prob_df.rename(columns={"total_offine":"total_minutes_offline_for_reason", "total_scheduled": "total_minutes_rotad_availability_in_quarter_and_time_period"})
1863    #         else:
1864    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability']]
1865    #         # final_prob_df = final_prob_df.rename(columns={'job_count': 'count'})
1866
1867    #         # Sort and save
1868    #         final_prob_df = final_prob_df.sort_values(by=['registration', 'quarter', 'six_hour_bin', 'reason']).reset_index(drop=True)
1869    #         final_prob_df.to_csv("distribution_data/ad_hoc_unavailability.csv", index=False)
1870
1871    #         print("Ad-hoc unavailability probability table generated successfully.")
1872
1873    #     except FileNotFoundError:
1874    #         print("Couldn't generate ad-hoc unavailability due to missing file(s). "
1875    #             "Please ensure 'external_data/ad_hoc.csv', "
1876    #             "'actual_data/HEMS_ROTA.csv', and "
1877    #             "'actual_data/callsign_registration_lookup.csv' exist.")
1878    #         pass
1879    #     except Exception as e:
1880    #         print(f"An error occurred during ad-hoc unavailability processing: {e}")
1881    #         pass
1882
1883    def ad_hoc_unavailability(self, period_start, period_end):
1884        """
1885        Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime.
1886
1887        Args:
1888            period_start (str/datetime): Start date for analysis period
1889            period_end (str/datetime): End date for analysis period
1890
1891        Returns:
1892            None. Results are saved to
1893            'distribution_data/ad_hoc_unavailability.csv'.
1894        """
1895        # Load and prepare ad-hoc downtime data
1896        adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1897        adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1898        # Load rota and callsign lookup data
1899        rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1900        callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1901        # Merge rota with callsign lookup to get registration numbers to allow matching with
1902        # ad-hoc data, which uses registrations
1903        full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign")
1904
1905        # Define the hour bands mapping
1906        HOUR_BANDS = {
1907            '00-05': (0, 6),   # 00:00 to 05:59
1908            '06-11': (6, 12),  # 06:00 to 11:59
1909            '12-17': (12, 18), # 12:00 to 17:59
1910            '18-23': (18, 24)  # 18:00 to 23:59
1911        }
1912
1913        # Create list of 6-hour bins
1914        bins = ['00-05', '06-11', '12-17', '18-23']
1915
1916        def is_summer(date_obj, summer_start_month=4, summer_end_month=9):
1917            """
1918            Determine if a date falls in summer months (April-September).
1919
1920            Args:
1921                date_obj: Date object to check
1922
1923            Returns:
1924                bool: True if date is in summer months
1925            """
1926            return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)]
1927
1928        def check_month_is_summer(month, summer_start_month=4, summer_end_month=9):
1929            """
1930            Determine whether a month number falls in the summer period (April-September by default).
1931
1932            Args:
1933                month: Integer month
1934
1935            Returns:
1936                str: 'summer' if the month is a summer month, otherwise 'winter'
1937            """
1938            return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter'
1939
1940        def get_band_for_hour(hour):
1941            """
1942            Return the 6-hour band name for a given hour (0-23).
1943
1944            Args:
1945                hour (int): Hour of day (0-23)
1946
1947            Returns:
1948                str or None: Band name or None if hour is invalid
1949            """
1950            for band_name, (start, end) in HOUR_BANDS.items():
1951                if start <= hour < end:
1952                    return band_name
1953            return None
1954
1955        def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end):
1956            """
1957            Calculate how many minutes of a shift fall within a specific time band.
1958
1959            Args:
1960                shift_start (float): Shift start time in hours (0-23.99)
1961                shift_end (float): Shift end time in hours (0-23.99)
1962                band_start (int): Band start hour
1963                band_end (int): Band end hour
1964
1965            Returns:
1966                int: Minutes of overlap between shift and band
1967            """
1968            # Find the overlap between shift and band
1969            overlap_start = max(shift_start, band_start)
1970            overlap_end = min(shift_end, band_end)
1971
1972            if overlap_start < overlap_end:
1973                return int((overlap_end - overlap_start) * 60)  # Convert to minutes
1974            return 0
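            # Worked example (illustrative): a 07:00-19:00 shift checked against the '06-11' band
            # (band_start=6, band_end=12) overlaps from 07:00 to 12:00, i.e. 5 hours = 300 minutes;
            # against the '00-05' band (0, 6) there is no overlap, so 0 is returned.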
1975
1976        # Create date range for analysis period
1977        date_range = pd.date_range(start=period_start,
1978                            end=period_end,
1979                            freq='D')
1980        daily_df = pd.DataFrame({'date': date_range})
1981        daily_df['date'] = pd.to_datetime(daily_df['date'])
1982
1983        # Create MultiIndex from date and bin combinations to get all date/time-band combinations
1984        multi_index = pd.MultiIndex.from_product(
1985            [daily_df['date'], bins],
1986            names=['date', 'six_hour_bin']
1987            )
1988
1989        # Convert MultiIndex to DataFrame and reset index
1990        daily_df = multi_index.to_frame(index=False).reset_index(drop=True)
1991        # Add quarter information for seasonal analysis
1992        daily_df["quarter"] = daily_df.date.dt.quarter
1993
1994        # Initialize availability columns for each aircraft registration
1995        # Each column will store minutes of scheduled time per time band
1996        for registration in full_rota_df['registration'].unique():
1997            daily_df[registration] = 0 # Initialize with 0 minutes
1998
1999        # Calculate scheduled availability for each date/time band combination
2000        for _, row in daily_df.iterrows():
2001            current_date = row['date']
2002            current_band = row['six_hour_bin']
2003            band_start, band_end = HOUR_BANDS[current_band]
2004
2005            is_current_date_summer = is_summer(current_date)
2006
2007            # Get the row index for updating the dataframe
2008            row_idx = daily_df[(daily_df['date'] == current_date) &
2009                            (daily_df['six_hour_bin'] == current_band)].index[0]
2010
2011            # Iterate through each resource's rota entry
2012            for _, rota_entry in full_rota_df.iterrows():
2013                registration = rota_entry['registration']
2014                # Select appropriate start/end times based on season
2015                start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start'
2016                end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end'
2017                start_hour = rota_entry[start_hour_col]
2018                end_hour = rota_entry[end_hour_col]
2019
2020                total_minutes_for_band = 0
2021
2022                if start_hour < end_hour:
2023                    # Shift within same day
2024                    total_minutes_for_band = calculate_minutes_in_band(
2025                        start_hour, end_hour, band_start, band_end
2026                    )
2027                elif start_hour > end_hour:
2028                    # Shift spans midnight - check both parts
2029
2030                    # Part 1: Today from start_hour to midnight
2031                    if band_end <= 24:  # This band is today
2032                        total_minutes_for_band += calculate_minutes_in_band(
2033                            start_hour, 24, band_start, band_end
2034                        )
2035
2036                    # Part 2: Tomorrow from midnight to end_hour
2037                    # Need to check if this band is for tomorrow
2038                    tomorrow = current_date + pd.Timedelta(days=1)
2039                    tomorrow_rows = daily_df[daily_df['date'] == tomorrow]
2040
2041                    if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values:
2042                        total_minutes_for_band += calculate_minutes_in_band(
2043                            0, end_hour, band_start, band_end
2044                        )
2045
2046                # Update the scheduled time for this aircraft in this time band
2047                daily_df.loc[row_idx, registration] += total_minutes_for_band
2048
2049        # Aggregate scheduled availability by quarter, time band, and registration
2050        available_time = (
2051            daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration")
2052            .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]]
2053            .sum().reset_index()
2054            )
2055
2056        def calculate_availability_row(row, rota_df, callsign_lookup_df):
2057            """
2058            Calculate downtime overlap with scheduled rota for a single downtime event.
2059
2060            Args:
2061                row: DataFrame row containing downtime information
2062                rota_df: DataFrame with rota schedules
2063                callsign_lookup_df: DataFrame mapping callsigns to registrations
2064
2065            Returns:
2066                dict: Dictionary containing processed availability data
2067            """
2068            # Extract downtime information
2069            registration = row['aircraft'].lower()
2070            downtime_start = pd.to_datetime(row['offline'], utc=True)
2071            downtime_end = pd.to_datetime(row['online'], utc=True)
2072            reason = row.get('reason', None)
2073
2074            # Determine which 6-hour bin this downtime starts in
2075            hour = downtime_start.hour
2076            if 0 <= hour <= 5:
2077                six_hour_bin = '00-05'
2078            elif 6 <= hour <= 11:
2079                six_hour_bin = '06-11'
2080            elif 12 <= hour <= 17:
2081                six_hour_bin = '12-17'
2082            else:
2083                six_hour_bin = '18-23'
2084
2085            quarter = downtime_start.quarter
2086
2087            # Find the callsign for this registration
2088            match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
2089
2090            # No matching callsign found
2091            if match.empty:
2092                return {
2093                    'registration': registration,
2094                    'offline': downtime_start,
2095                    'online': downtime_end,
2096                    'six_hour_bin': six_hour_bin,
2097                    'quarter': quarter,
2098                    'total_offline': None,
2099                    'reason': reason
2100                }
2101
2102            callsign = match.iloc[0]['callsign']
2103            # Find rota entries for this callsign
2104            rota_rows = rota_df[rota_df['callsign'] == callsign]
2105            if rota_rows.empty:
2106                return {
2107                    'registration': registration,
2108                    'offline': downtime_start,
2109                    'online': downtime_end,
2110                    'six_hour_bin': six_hour_bin,
2111                    'quarter': quarter,
2112                    'total_offline': None,
2113                    'reason': reason
2114                }
2115
2116            # Determine season for appropriate rota times
2117            month = downtime_start.month
2118            season = check_month_is_summer(month)
2119
2120            total_overlap_minutes = 0
2121
2122            # Calculate overlap between downtime and scheduled rota times
2123            for _, rota in rota_rows.iterrows():
2124                start_hour = rota[f'{season}_start']
2125                end_hour = rota[f'{season}_end']
2126
2127                # Check overlap across multiple days (yesterday, today, tomorrow)
2128                # This handles shifts that span midnight
2129                for base_day in [downtime_start.normalize() - timedelta(days=1),
2130                                downtime_start.normalize(),
2131                                downtime_start.normalize() + timedelta(days=1)]:
2132
2133                    rota_start = base_day + timedelta(hours=start_hour)
2134                    rota_end = base_day + timedelta(hours=end_hour)
2135
2136                    # Handle shifts that cross midnight
2137                    if end_hour <= start_hour:
2138                        rota_end += timedelta(days=1)
2139                    # Calculate overlap between downtime and this rota period
2140                    overlap_start = max(downtime_start, rota_start)
2141                    overlap_end = min(downtime_end, rota_end)
2142
2143                    if overlap_end > overlap_start:
2144                        overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
2145
2146                        total_overlap_minutes += overlap_minutes
2147
2148            return {
2149                'registration': registration,
2150                'offline': downtime_start,
2151                'online': downtime_end,
2152                'six_hour_bin': six_hour_bin,
2153                'quarter': quarter,
2154                'total_offline': total_overlap_minutes,
2155                'reason': reason
2156            }
2157
2158        # Process all ad-hoc downtime events
2159        results = adhoc_df.apply(
2160                        lambda row: calculate_availability_row(
2161                            row, rota_df, callsign_lookup_df),
2162                        axis=1
2163                    )
2164
2165        # Convert results to DataFrame and select relevant columns
2166        unavailability_minutes_df = (
2167            pd.DataFrame(results.tolist())
2168            [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]]
2169            )
2170
2171        # Aggregate offline time by registration, time band, quarter, and reason
2172        offline_durations = unavailability_minutes_df.groupby(
2173            ["registration", "six_hour_bin", "quarter", "reason"]
2174            )[["total_offline"]].sum().reset_index()
2175
2176        # Create complete combinations to ensure all possible categories are represented
2177        # This prevents missing data issues in the final output
2178        registrations = offline_durations['registration'].unique()
2179        six_hour_bins = offline_durations['six_hour_bin'].unique()
2180        quarters = offline_durations['quarter'].unique()
2181        reasons = offline_durations['reason'].unique()
2182
2183        # Generate all possible combinations
2184        all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons))
2185
2186        # Create complete dataframe with all combinations
2187        complete_df = pd.DataFrame(
2188            all_combinations,
2189            columns=['registration', 'six_hour_bin', 'quarter', 'reason']
2190            )
2191
2192        # Merge with original data to preserve existing values, fill missing with 0
2193        offline_durations = complete_df.merge(
2194            offline_durations,
2195            on=['registration', 'six_hour_bin', 'quarter', 'reason'],
2196            how='left'
2197            )
2198
2199        # Fill NaN values with 0 (no downtime for those combinations)
2200        offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0)
2201
2202        # Sort for better readability
2203        offline_durations = offline_durations.sort_values(
2204            ['registration', 'six_hour_bin', 'quarter', 'reason']
2205            ).reset_index(drop=True)
2206
2207        # Ensure consistent case for registration names
2208        available_time["registration"] = available_time["registration"].str.lower()
2209        offline_durations["registration"] = offline_durations["registration"].str.lower()
2210
2211        # Merge scheduled time with downtime data
2212        ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", "six_hour_bin"])
2213        ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"]
2214
2215        # Calculate availability probability (1 - sum of all unavailability probabilities)
2216        available_prop_df = ad_hoc.groupby(
2217            ["quarter", "six_hour_bin", "registration", "rota_time"]
2218            )[["probability"]].sum().reset_index()
2219
2220        available_prop_df["reason"] = "available"
2221        available_prop_df["probability"] = 1 - available_prop_df["probability"]
2222
2223        # Combine unavailability and availability data
2224        final_ad_hoc_df = (
2225            pd.concat([ad_hoc, available_prop_df])
2226            .sort_values(["quarter", "six_hour_bin", "registration", "reason"])
2227            )
2228
2229        # Handle cases where there's no scheduled time (set probability to NaN)
2230        final_ad_hoc_df["probability"] = final_ad_hoc_df.apply(
2231            lambda x: np.nan if x["rota_time"]==0 else x["probability"],
2232            axis=1
2233            )
2234
2235        final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5)
2236
2237        # Save results to CSV
2238        final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv(
2239            "distribution_data/ad_hoc_unavailability.csv", index=False
2240            )
2241
2242
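        # Illustrative usage sketch (dates are placeholders; assumes the input CSVs read above exist):
        #   DistributionFitUtils('data/my_data.csv').ad_hoc_unavailability(
        #       period_start="2023-01-01", period_end="2024-12-31")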
2243    def run_sim_on_historical_params(self):
2244        # Ensure all rotas are using default values
2245        rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv")
2246        rota.to_csv("actual_data/HEMS_ROTA.csv", index=False)
2247
2248        callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv")
2249        callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False)
2250
2251        service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv")
2252        service_history.to_csv("actual_data/service_history.csv", index=False)
2253
2254        service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv")
2255        service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False)
2256
2257        print("Generating simulation results...")
2258        removeExistingResults()
2259
2260        total_runs = 30
2261        sim_years = 2
2262        sim_duration = 60 * 24 * 7 * 52 * sim_years
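            # i.e. 52 weeks x 7 days x 24 hours x 60 minutes per simulated year (524,160 minutes per year)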
2263
2264        parallelProcessJoblib(
2265            total_runs=total_runs,
2266            sim_duration=sim_duration,
2267            warm_up_time=0,
2268            sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"),
2269            amb_data=False,
2270            print_debug_messages=True
2271        )
2272
2273        collateRunResults()
2274
2275        try:
2276            results_all_runs = pd.read_csv("data/run_results.csv")
2277            # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False)
2278
2279            # save data of counts of suboptimal care category sent
2280            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index()
2281
2282            counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index()
2283
2284            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv")
2285
2286            # save data of counts of suboptimal vehicle type sent
2287            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index()
2288
2289            counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index()
2290
2291            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")
2292
2293
2294            # # Also run the model to get some base-case outputs
2295            # resource_requests = (
2296            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
2297            #     .copy()
2298            #     )
2299
2300            # resource_requests["care_cat"] = (
2301            #     resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y"
2302            #                             and x["care_cat"]=="REG" else x["care_cat"],
2303            #                             axis=1)
2304            #                             )
2305
2306            # missed_jobs_care_cat_summary = (
2307            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
2308            #     .sort_values(["care_cat", "time_type"])
2309            #     .copy()
2310            #     )
2311
2312            # missed_jobs_care_cat_summary["jobs_average"] = (
2313            #     missed_jobs_care_cat_summary["jobs"]/
2314            #     total_runs
2315            #     )
2316
2317            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
2318            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365)
2319            #     ).round(0)
2320
2321            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
2322                    results_all_runs=results_all_runs,
2323                    run_length_days = float(sim_years*365),
2324                    what="summary"
2325                    )
2326
2327            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")
2328
2329
2330            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
2331                    results_all_runs=results_all_runs,
2332                    run_length_days = float(sim_years*365),
2333                    what="breakdown"
2334                    )
2335
2336            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")
2337
2338        except FileNotFoundError:
2339            pass

The DistributionFitUtils class

This class will import a CSV, undertake some light wrangling and then determine distributions and probabilities required for the Discrete Event Simulation

example usage: my_data = DistributionFitUtils('data/my_data.csv'); my_data.import_and_wrangle()

DistributionFitUtils( file_path: str, calculate_school_holidays=False, school_holidays_years=0)
36    def __init__(self, file_path: str, calculate_school_holidays = False, school_holidays_years = 0):
37
38        self.file_path = file_path
39        self.df = pd.DataFrame()
40
41        # The number of additional years of school holidays
42        # that will be calculated over that maximum date in the provided dataset
43        self.school_holidays_years = school_holidays_years
44        self.calculate_school_holidays = calculate_school_holidays
45
46        self.times_to_fit = [
47            {"hems_result": "Patient Treated but not conveyed by HEMS",
48            "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
49            {"hems_result": "Patient Conveyed by HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
50            {"hems_result": "Patient Conveyed by land with HEMS" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
51            {"hems_result": "Stand Down" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_clear']},
52            {"hems_result": "Landed but no patient contact" , "times_to_fit" : ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
53        ]
54
55        self.sim_tools_distr_plus = [
56            "poisson",
57            "bernoulli",
58            "triang",
59            "erlang",
60            "weibull_min",
61            "expon_weib",
62            "betabinom",
63            "pearson3",
64            "cauchy",
65            "chi2",
66            "expon",
67            "exponpow",
68            "gamma",
69            "lognorm",
70            "norm",
71            "powerlaw",
72            "rayleigh",
73            "uniform",
74            "neg_binomial",
75            "zip"
76        ]
77        # SR 16-04-2025 Have hardcoded the common distributions
78        # to make setup for random number generation more robust
79        #+ get_common_distributions()
file_path
df
school_holidays_years
calculate_school_holidays
times_to_fit
sim_tools_distr_plus
def removeExistingResults(self, folder: str) -> None:
81    def removeExistingResults(self, folder: str) -> None:
82            """
83                Removes results from previous fitting
84            """
85
86            matching_files = glob.glob(os.path.join(folder, "*.*"))
87
88            print(matching_files)
89
90            for file in matching_files:
91                os.remove(file)

Removes results from previous fitting

def getBestFit( self, q_times, distr=['cauchy', 'chi2', 'expon', 'exponpow', 'gamma', 'lognorm', 'norm', 'powerlaw', 'rayleigh', 'uniform'], show_summary=False):
 94    def getBestFit(self, q_times, distr=get_common_distributions(), show_summary=False):
 95        """
 96
 97            Convenience function for Fitter.
 98            Returns the model and parameters that are considered
 99            the 'best fit'.
100
101            TODO: Determine how Fitter works this out
102
103        """
104
105        if(q_times.size > 0):
106            if(len(distr) > 0):
107                f = Fitter(q_times, timeout=60, distributions=distr)
108            else:
109                f = Fitter(q_times, timeout=60)
110            f.fit()
111            if show_summary == True:
112                f.summary()
113            return f.get_best()
114        else:
115            return {}

Convenience function for Fitter. Returns the model and parameters that are considered the 'best fit'.

TODO: Determine how Fitter works this out
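A note on the TODO above: as I understand recent releases of the fitter package, get_best() ranks the candidate distributions by the sum of squared errors between each fitted PDF and the data histogram, and other criteria can be requested via its method argument. The snippet below is a minimal sketch under that assumption, not part of this module, and the data are made up for illustration.

    import numpy as np
    from fitter import Fitter

    # Illustrative data only; in this module q_times comes from the wrangled CSV
    q_times = np.random.default_rng(42).gamma(shape=2.0, scale=15.0, size=500)

    f = Fitter(q_times, distributions=["gamma", "lognorm", "expon"], timeout=60)
    f.fit()
    print(f.get_best(method="sumsquare_error"))  # default ranking criterion (assumed)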

def import_and_wrangle(self):
118    def import_and_wrangle(self):
119        """
120
121            Function to import CSV, add additional columns that are required
122            and then sequentially execute other class functions to generate
123            the probabilities and distributions required for the DES.
124
125            TODO: Additional logic is required to check the imported CSV
126            for missing values, incorrect column names etc.
127
128        """
129
130        try:
131            df = pd.read_csv(self.file_path, quoting=QUOTE_ALL)
132            self.df = df
133
134            # Perhaps run some kind of checking function here.
135
136        except FileNotFoundError:
137            print(f"Cannot locate file: {self.file_path}")
138
139        # If everything is okay, crack on...
140        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
141        self.df['date_only'] = pd.to_datetime(df['inc_date'].dt.date)
142        self.df['hour'] = self.df['inc_date'].dt.hour                      # Hour of the day
143        self.df['day_of_week'] = self.df['inc_date'].dt.day_name()         # Day of the week (e.g., Monday)
144        self.df['month'] = self.df['inc_date'].dt.month
145        self.df['quarter'] = self.df['inc_date'].dt.quarter
146        self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]')
147
148        # Replacing an upper quartile limit on job cycle times and
149        # instead using a manually specified time frame.
150        # This has the advantage of allowing manual amendment of the values
151        # on the front-end
152        # self.max_values_df = self.upper_allowable_time_bounds()
153        self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv')
154        #print(self.min_max_values_df)
155
156        #This will be needed for other datasets, but has already been computed for DAA
157        #self.df['ampds_card'] = self.df['ampds_code'].str[:2]
158
159        self.removeExistingResults(Utils.HISTORICAL_FOLDER)
160        self.removeExistingResults(Utils.DISTRIBUTION_FOLDER)
161
162        #get proportions of AMPDS card by hour of day
163        self.hour_by_ampds_card_probs()
164
165        # Determine 'best' distributions for time-based data
166        self.activity_time_distributions()
167
168        # Calculate probability patient will be female based on AMPDS card
169        self.sex_by_ampds_card_probs()
170
171        # Determine 'best' distributions for age ranges stratified by AMPDS card
172        self.age_distributions()
173
174        # Alternative approach to IA times. Start with probability of call at given hour stratified by quarter
175        self.hourly_arrival_by_qtr_probs()
176
177        # Calculates the mean and standard deviation of the number of incidents per day stratified by quarter
178        self.incidents_per_day()
179        self.incidents_per_day_samples()
180
181        # Calculate probability of enhanced or critical care being required based on AMPDS card
182        self.enhanced_or_critical_care_by_ampds_card_probs()
183
184        # Calculate HEMS result
185        self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs()
186
187        # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day
188        # self.callsign_group_by_ampds_card_and_hour_probs()
189        # self.callsign_group_by_ampds_card_probs()
190        # self.callsign_group_by_care_category()
191        self.callsign_group()
192
193        # Calculate probability of a particular vehicle type based on callsign group and month of year
194        # self.vehicle_type_by_month_probs()
195        self.vehicle_type_by_quarter_probs()
196        # self.vehicle_type_probs() # Similar to previous but without monthly stratification since ad hoc unavailability should account for this.
197
198        # Calculate the patient outcome (conveyed, deceased, unknown)
199        self.patient_outcome_by_care_category_and_quarter_probs()
200
201        # ============= ARCHIVED CODE ================= #
202        # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day
203        # self.inter_arrival_times()
204        # ============= END ARCHIVED CODE ================= #
205
206
207        # ============= ARCHIVED CODE ================= #
208        # Calculate probability of patient outcome
209        # Note - this still needs to be run to support another one?
210        # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign
211        # self.pt_outcome_by_hems_result_and_care_category_probs()
212        # ============= END ARCHIVED CODE ================= #
213
214        # ============= ARCHIVED CODE ================= #
215        # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs()
216        # ============= END ARCHIVED CODE ================= #
217
218        # ============= ARCHIVED CODE ================= #
219        # Calculate probability of HEMS result being allocated to a job based on callsign and hour of day
220        # self.hems_result_by_callsign_group_and_vehicle_type_probs()
221        # ============= END ARCHIVED CODE ================= #
222
223        # ============= ARCHIVED CODE ================= #
224        # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit
225        # self.hems_result_by_care_cat_and_helicopter_benefit_probs()
226        # ============= END ARCHIVED CODE ================= #
227
228
229        # Calculate school holidays since servicing schedules typically avoid these dates
230        if self.calculate_school_holidays:
231            self.school_holidays()
232
233        # Calculate historical data
234        self.historical_monthly_totals()
235        self.historical_monthly_totals_by_callsign()
236        self.historical_monthly_totals_by_day_of_week()
237        self.historical_median_time_of_activities_by_month_and_resource_type()
238        self.historical_monthly_totals_by_hour_of_day()
239        self.historical_monthly_resource_utilisation()
240        self.historical_monthly_totals_all_calls()
241        self.historical_daily_calls_breakdown()
242        self.historical_job_durations_breakdown()
243        self.historical_missed_jobs()
244        self.historical_jobs_per_day_per_callsign()
245        self.historical_care_cat_counts()
246
247        # Calculate proportions of ad hoc unavailability
248        try:
249            # self.ad_hoc_unavailability()
250            self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31")
251        except FileNotFoundError:
252            print("Couldn't find ad-hoc unavailability file")

Function to import CSV, add additional columns that are required and then sequentially execute other class functions to generate the probabilities and distributions required for the DES.

TODO: Additional logic is required to check the imported CSV for missing values, incorrect column names, etc.
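The check hinted at in the TODO could be a small helper run straight after read_csv(). The sketch below is hypothetical: the helper name and the required-column list are assumptions inferred from the columns used later in this method, and it is not part of the class.

    import pandas as pd

    # Columns assumed to be required, based on what import_and_wrangle() uses later
    REQUIRED_COLUMNS = ["inc_date", "ampds_card", "sex", "age", "hems_result",
                        "vehicle_type", "callsign_group"]

    def check_csv(df: pd.DataFrame) -> list:
        """Return a list of human-readable problems found in the imported data."""
        problems = [f"Missing column: {c}" for c in REQUIRED_COLUMNS if c not in df.columns]
        if "inc_date" in df.columns and pd.to_datetime(df["inc_date"], errors="coerce").isna().any():
            problems.append("Some inc_date values could not be parsed as dates")
        return problems

import_and_wrangle() could then warn, or stop, if the returned list is non-empty.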

def hour_by_ampds_card_probs(self):
254    def hour_by_ampds_card_probs(self):
255        """
256
257            Calculates the proportions of calls that are triaged with
258            a specific AMPDS card. This is stratified by hour of day
259
260            TODO: Determine whether this should also be stratified by yearly quarter
261
262        """
263        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
264        total_counts = category_counts.groupby('hour')['count'].transform('sum')
265        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
266
267        #category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))
268
269        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")

Calculates the proportions of calls that are triaged with a specific AMPDS card. This is stratified by hour of day

TODO: Determine whether this should also be stratified by yearly quarter
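If the proportions were also stratified by yearly quarter, as the TODO suggests, the change amounts to adding 'quarter' to both groupings. A minimal sketch of what the method body might become (the output filename is an assumption):

    category_counts = self.df.groupby(['quarter', 'hour', 'ampds_card']).size().reset_index(name='count')
    total_counts = category_counts.groupby(['quarter', 'hour'])['count'].transform('sum')
    category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)
    category_counts.to_csv('distribution_data/hour_by_ampds_card_by_qtr_probs.csv', mode="w+")  # hypothetical filename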

def sex_by_ampds_card_probs(self):
272    def sex_by_ampds_card_probs(self):
273        """
274
275            Calculates the probability that the patient will be female
276            stratified by AMPDS card.
277
278        """
279        age_df = self.df
280        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
281        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
282        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)
283
284        category_counts[category_counts['sex'] =='Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")

Calculates the probability that the patient will be female stratified by AMPDS card.

def activity_time_distributions(self):
286    def activity_time_distributions(self):
287        """
288
289            Determine the 'best' distribution for each phase of a call
290            i.e. Allocation time, Mobilisation time, Time to scene
291            Time on scene, Travel time to hospital and handover, Time to clear.
292            Not all times will apply to all cases, so the class 'times_to_fit'
293            variable is a list of dictionaries, which contains the times to fit
294
295            The data is currently stratified by HEMS_result and vehicle type fields.
296
297        """
298
299        vehicle_type = self.df['vehicle_type'].dropna().unique()
300
301        # We'll need to make sure that where a distribution is missing that the time is set to 0 in the model.
302        # Probably easier than complicated logic to determine what times should be available based on hems_result
303
304        final_distr = []
305
306        for row in self.times_to_fit:
307            #print(row)
308            for ttf in row['times_to_fit']:
309                for vt in vehicle_type:
310                    #print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is  {vt}")
311
312                    # This line might not be required if data quality is determined when importing the data
313                    max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0]
314                    min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0]
315
316                    #print(f"Max time is {max_time} and Min time is {min_time}")
317
318                    if ttf == 'time_on_scene':
319                        # There is virtually no data for HEMS_result other than patient conveyed
320                        # which is causing issues with fitting. For time on scene, will
321                        # use a simplified fitting ignoring hems_result as a category
322                        fit_times = self.df[
323                            (self.df.vehicle_type == vt) &
324                            (self.df[ttf] >= min_time) &
325                            (self.df[ttf] <= max_time)
326                        ][ttf]
327                    else:
328                        fit_times = self.df[
329                            (self.df.vehicle_type == vt) &
330                            (self.df[ttf] >= min_time) &
331                            (self.df[ttf] <= max_time) &
332                            (self.df.hems_result == row['hems_result'])
333                        ][ttf]
334                    #print(fit_times[:10])
335                    best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
336                    #print(best_fit)
337
338                    return_dict = { "vehicle_type": vt, "time_type" : ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)}
339                    #print(return_dict)
340                    final_distr.append(return_dict)
341
342        with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file:
343            convert_file.write(json.dumps(final_distr))
344        convert_file.close()

Determine the 'best' distribution for each phase of a call, i.e. Allocation time, Mobilisation time, Time to scene, Time on scene, Travel time to hospital and handover, and Time to clear. Not all times will apply to all cases, so the class 'times_to_fit' variable is a list of dictionaries, which contains the times to fit.

The data is currently stratified by the HEMS_result and vehicle type fields.
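The resulting activity_time_distributions.txt file is a JSON list in which each entry records the vehicle type, time type, HEMS result, sample size and the best_fit mapping returned by Fitter.get_best(). A minimal consumption sketch, assuming best_fit looks like {"lognorm": {"s": ..., "loc": ..., "scale": ...}} with scipy.stats parameter names; entries whose distribution names are not plain scipy.stats names (e.g. "zip", "neg_binomial") would need their own mapping:

    import json
    import scipy.stats as st

    with open("distribution_data/activity_time_distributions.txt") as f:
        entries = json.load(f)

    entry = entries[0]                                # e.g. a helicopter / time_allocation entry
    (dist_name, params), = entry["best_fit"].items()  # single-key dict from get_best()
    sampled_minutes = getattr(st, dist_name).rvs(**params)
    print(entry["vehicle_type"], entry["time_type"], round(sampled_minutes, 1))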

def age_distributions(self):
347    def age_distributions(self):
348        """
349
350            Determine the 'best' distribution for age stratified by
351            AMPDS card
352
353        """
354
355        age_distr = []
356
357        age_df = self.df[["age", "ampds_card"]].dropna()
358        ampds_cards = age_df['ampds_card'].unique()
359        print(ampds_cards)
360
361        for card in ampds_cards:
362            fit_ages = age_df[age_df['ampds_card'] == card]['age']
363            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
364            return_dict = { "ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
365            age_distr.append(return_dict)
366
367        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
368            convert_file.write(json.dumps(age_distr))
369        convert_file.close()

Determine the 'best' distribution for age stratified by AMPDS card

def incidents_per_day(self):
403    def incidents_per_day(self):
404        """
405        Fit distributions for number of incidents per day using actual daily counts,
406        applying year-based weighting to reflect trends (e.g., 2024 busier than 2023),
407        stratified by season and quarter.
408        """
409        import math
410        import json
411        import numpy as np
412
413        inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy()
414        inc_df['year'] = inc_df['date_only'].dt.year
415
416        # Daily incident counts
417        inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
418        inc_per_day['year'] = inc_per_day['date_only'].dt.year
419
420        # Merge quarter and season from self.df
421        date_info = self.df[['date_only', 'quarter']].drop_duplicates()
422
423        if 'season' not in self.df.columns:
424            date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
425        else:
426            date_info = date_info.merge(
427                self.df[['date_only', 'season']].drop_duplicates(),
428                on='date_only',
429                how='left'
430            )
431
432        inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left')
433
434        # Weight settings - simple implementation rather than biased mean thing
435        year_weights = {
436            2023: 1.0,
437            2024: 4.0  # weight 2024 four times as heavily as 2023
438        }
439
440        # ========== SEASONAL DISTRIBUTIONS ==========
441        jpd_distr = []
442
443        for season in inc_per_day['season'].dropna().unique():
444            filtered = inc_per_day[inc_per_day['season'] == season].copy()
445            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
446
447            # Repeat rows proportionally by weight
448            replicated = filtered.loc[
449                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
450            ]['jobs_per_day']
451
452            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
453
454            jpd_distr.append({
455                "season": season,
456                "best_fit": best_fit,
457                "min_n_per_day": int(replicated.min()),
458                "max_n_per_day": int(replicated.max()),
459                "mean_n_per_day": float(replicated.mean())
460            })
461
462        with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f:
463            json.dump(jpd_distr, f)
464
465        # ========== QUARTERLY DISTRIBUTIONS ==========
466        jpd_qtr_distr = []
467
468        for quarter in inc_per_day['quarter'].dropna().unique():
469            filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy()
470            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)
471
472            replicated = filtered.loc[
473                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
474            ]['jobs_per_day']
475
476            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)
477
478            jpd_qtr_distr.append({
479                "quarter": int(quarter),
480                "best_fit": best_fit,
481                "min_n_per_day": int(replicated.min()),
482                "max_n_per_day": int(replicated.max()),
483                "mean_n_per_day": float(replicated.mean())
484            })
485
486        with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f:
487            json.dump(jpd_qtr_distr, f)

Fit distributions for number of incidents per day using actual daily counts, applying year-based weighting to reflect trends (e.g., 2024 busier than 2023), stratified by season and quarter.
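The year weighting above works by replicating each day's count in proportion to its weight before fitting: with the scale factor of 10, a 2023 day (weight 1.0) contributes 10 copies of its count and a 2024 day (weight 4.0) contributes 40. A small self-contained illustration of the replication step, with made-up values:

    import pandas as pd

    filtered = pd.DataFrame({
        "year": [2023, 2024],
        "jobs_per_day": [6, 9],
        "weight": [1.0, 4.0],
    })
    replicated = filtered.loc[
        filtered.index.repeat((filtered["weight"] * 10).round().astype(int))
    ]["jobs_per_day"]
    print(replicated.value_counts())  # 6 appears 10 times, 9 appears 40 times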

def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
489    def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
490        """
491            Create weighted empirical samples of incidents per day by season and quarter.
492        """
493
494        inc_df = self.df[['date_only', 'quarter']].copy()
495        inc_df['year'] = inc_df['date_only'].dt.year
496        inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
497
498        # Get raw counts per day
499        daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
500        daily_counts['year'] = daily_counts['date_only'].dt.year
501
502        # Merge back in season/quarter info
503        meta_info = self.df[['date_only', 'quarter']].drop_duplicates()
504        if 'season' in self.df.columns:
505            meta_info = meta_info.merge(
506                self.df[['date_only', 'season']].drop_duplicates(),
507                on='date_only', how='left'
508            )
509        else:
510            meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
511
512        daily_counts = daily_counts.merge(meta_info, on='date_only', how='left')
513
514        # Year weight map
515        if weight_map is None:
516            weight_map = {2023: 1.0, 2024: 1.1}
517
518        # Compute weights
519        daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0)
520
521        # Storage
522        empirical_samples = {}
523
524        # Season-based
525        for season in daily_counts['season'].dropna().unique():
526            filtered = daily_counts[daily_counts['season'] == season].copy()
527            repeated = filtered.loc[
528                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
529            ]['jobs_per_day'].tolist()
530
531            empirical_samples[season] = repeated
532
533        # Quarter-based
534        for quarter in daily_counts['quarter'].dropna().unique():
535            filtered = daily_counts[daily_counts['quarter'] == quarter].copy()
536            repeated = filtered.loc[
537                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
538            ]['jobs_per_day'].tolist()
539
540            empirical_samples[f"Q{int(quarter)}"] = repeated
541
542        with open("distribution_data/inc_per_day_samples.json", 'w') as f:
543            json.dump(empirical_samples, f)

Create weighted empirical samples of incidents per day by season and quarter.
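The JSON written here is keyed by season ("winter"/"summer") and quarter ("Q1".."Q4"), each value being the weighted, replicated list of daily counts. How the simulation draws from it is not shown in this module; one plausible approach, sketched below as an assumption, is a uniform draw from the list, since the year weighting is already baked into the replication:

    import json
    import random

    with open("distribution_data/inc_per_day_samples.json") as f:
        samples = json.load(f)

    rng = random.Random(42)
    jobs_today = rng.choice(samples["Q1"])  # uniform draw from the weighted empirical sample
    print(jobs_today)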

def enhanced_or_critical_care_by_ampds_card_probs(self):
546    def enhanced_or_critical_care_by_ampds_card_probs(self):
547        """
548
549            Calculates the probability of an enhanced or critical care resource being required
550            based on the AMPDS card
551
552        """
553
554        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()
555
556        def assign_care_category(row):
557            # There are some columns with both EC and CC benefit selected
558            # this function will allocate to only 1
559            if row['cc_benefit'] == 'y':
560                return 'CC'
561            elif row['ec_benefit'] == 'y':
562                return 'EC'
563            else:
564                return 'REG'
565
566        ec_df['care_category'] = ec_df.apply(assign_care_category, axis = 1)
567
568        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
569        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')
570
571        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)
572
573        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode = "w+", index = False)

Calculates the probability of an enhanced or critical care resource being required based on the AMPDS card

def patient_outcome_by_care_category_and_quarter_probs(self):
575    def patient_outcome_by_care_category_and_quarter_probs(self):
576        """
577
578            Calculates the probability of a patient outcome based on care category and yearly quarter
579
580        """
581
582        po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy()
583
584        def assign_care_category(row):
585            # There are some columns with both EC and CC benefit selected
586            # this function will allocate to only 1
587            if row['cc_benefit'] == 'y':
588                return 'CC'
589            elif row['ec_benefit'] == 'y':
590                return 'EC'
591            else:
592                return 'REG'
593
594        # There are some values that are missing e.g. CC quarter 1 Deceased
595        # I think we've had problems when trying to sample from this kind of thing before
596        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
597        outcomes = po_df['pt_outcome'].unique()
598        care_categories = ['CC', 'EC', 'REG']
599        quarters = po_df['quarter'].unique()
600
601        all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)),
602                                    columns=['pt_outcome', 'care_category', 'quarter'])
603
604        po_df['care_category'] = po_df.apply(assign_care_category, axis = 1)
605
606        po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count')
607
608        merged = pd.merge(all_combinations, po_cat_counts,
609                      on=['pt_outcome', 'care_category', 'quarter'],
610                      how='left').fillna({'count': 0})
611        merged['count'] = merged['count'].astype(int)
612
613        total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum')
614        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
615
616        merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode = "w+", index = False)

Calculates the probability of a patient outcome based on care category and yearly quarter

def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
654    def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
655        """
656
657            Calculates the probability of a given HEMS result based on
658                - patient outcome
659                - yearly quarter
660                - time of day (7am - 6pm, 7pm - 6am)
661                - vehicle type
662                - and callsign group
663
664        """
665        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
666        self.df['hour'] = self.df['inc_date'].dt.hour
667        self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night")
668
669        hr_df = self.df[[
670            'hems_result', 'quarter', 'pt_outcome',
671            'vehicle_type', 'callsign_group', 'time_of_day'
672            ]].copy()
673
674        # There are some values that are missing e.g. CC quarter 1 Deceased
675        # I think we've had problems when trying to sample from this kind of thing before
676        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
677        # hems_results = hr_df['hems_result'].unique()
678        # outcomes = hr_df['pt_outcome'].unique()
679        # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
680        # callsign_group_categories = hr_df['callsign_group'].unique()
681        # quarters = hr_df['quarter'].unique()
682
683        # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
684        #                             columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])
685
686        hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome',
687                                       'vehicle_type', 'callsign_group',
688                                       'quarter', 'time_of_day']).size().reset_index(name='count')
689
690        # merged = pd.merge(all_combinations, hr_cat_counts,
691        #               on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
692        #               how='left').fillna({'count': 0})
693        # merged['count'] = merged['count'].astype(int)
694
695        merged = hr_cat_counts
696
697        total_counts = merged.groupby(['pt_outcome',
698                                       'vehicle_type', 'callsign_group',
699                                       'quarter', 'time_of_day'])['count'].transform('sum')
700        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)
701
702        merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)

Calculates the probability of a given HEMS result based on:

  • patient outcome
  • yearly quarter
  • time of day (7am - 6pm, 7pm - 6am)
  • vehicle type
  • callsign group

def hourly_arrival_by_qtr_probs(self):
706    def hourly_arrival_by_qtr_probs(self):
707        """
708
709            Calculates the proportions of calls arriving in any given hour
710            stratified by yearly quarter
711
712        """
713
714        ia_df = self.df[['quarter', 'hour']].dropna()
715
716        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
717        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
718        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)
719
720        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")

Calculates the proportions of calls arriving in any given hour stratified by yearly quarter

def callsign_group_by_ampds_card_and_hour_probs(self):
723    def callsign_group_by_ampds_card_and_hour_probs(self):
724        """
725
726            Calculates the probability of a specific callsign being allocated to
727            a call based on the AMPDS card category and hour of day
728
729        """
730        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')
731
732        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
733        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
734
735        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific callsign being allocated to a call based on the AMPDS card category and hour of day

def callsign_group_by_ampds_card_probs(self):
737    def callsign_group_by_ampds_card_probs(self):
738        """
739
740            Calculates the probability of a specific callsign being allocated to
741            a call based on the AMPDS card category
742
743        """
744
745        callsign_df = self.df[self.df['callsign_group'] != 'Other']
746
747        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')
748
749        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
750        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
751
752        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific callsign being allocated to a call based on the AMPDS card category

def callsign_group(self):
754    def callsign_group(self):
755        """
756            Calculates the probability of a specific callsign being allocated to
757            a call
758        """
759        df = self.df.copy()
760
761        # Convert time fields to numeric
762        time_fields = [
763            "time_allocation", "time_mobile", "time_to_scene",
764            "time_on_scene", "time_to_hospital", "time_to_clear"
765        ]
766        for col in time_fields:
767            df[col] = pd.to_numeric(df[col], errors="coerce")
768
769        # Calculate total job duration in minutes
770        df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)
771
772        # Compute job start and end times
773        df["start_time"] = df["inc_date"]
774        df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")
775
776        # Sort jobs by start time
777        df = df.sort_values(by="start_time").reset_index(drop=True)
778
779        # Set to hold indices of jobs that overlap (but only the later-starting ones)
780        overlapping = set()
781
782        # Check for overlaps
783        for i in range(len(df)):
784            this_end = df.at[i, "end_time"]
785
786            # Compare only to later jobs
787            for j in range(i + 1, len(df)):
788                next_start = df.at[j, "start_time"]
789                if next_start >= this_end:
790                    break  # No more possible overlaps
791                # If it starts before i's job ends, it's overlapping
792                overlapping.add(j)
793
794        # Mark the overlaps in the dataframe
795        df["overlaps"] = df.index.isin(overlapping)
796
797        # Filter out overlapping jobs
798        df_no_overlap = df[~df["overlaps"]]
799
800        # We will use the ad-hoc unavailability to remove any instances where we already know one of
801        # the vehicles to be recorded as offline
802
803        data = df_no_overlap.copy()
804
805        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
806        # off for servicing if that data is available
807        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
808        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
809
810        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
811        data["vehicle"] = data["vehicle"].str.lower()
812
813        # Create a cross-join between data and ad_hoc
814        data['key'] = 1
815        ad_hoc['key'] = 1
816        merged = data.merge(ad_hoc, on='key')
817
818        # Keep rows where inc_date falls within the offline period
819        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
820
821        # Filter out those rows from the original data
822        df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
823
824        callsign_df = (
825            df_no_overlap
826            .assign(
827                helicopter_benefit=np.select(
828                    [
829                        df_no_overlap["cc_benefit"] == "y",
830                        df_no_overlap["ec_benefit"] == "y",
831                        df_no_overlap["hems_result"].isin([
832                            "Stand Down En Route",
833                            "Landed but no patient contact",
834                            "Stand Down Before Mobile"
835                        ])
836                    ],
837                    ["y", "y", "n"],
838                    default=df_no_overlap["helicopter_benefit"]
839                ),
840                care_category=np.select(
841                    [
842                        df_no_overlap["cc_benefit"] == "y",
843                        df_no_overlap["ec_benefit"] == "y"
844                    ],
845                    ["CC", "EC"],
846                    default="REG"
847                )
848            )
849        )
850
851        callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']
852
853        callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count')
854
855        total_counts = len(callsign_df)
856        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
857
858        callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific callsign being allocated to a call
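The overlap-removal step above sorts jobs by start time and flags any job that starts before an earlier job has finished, keeping only the earliest job of each overlapping cluster. A tiny worked illustration of that sweep, with made-up times:

    import pandas as pd

    jobs = pd.DataFrame({
        "start_time": pd.to_datetime(["2024-01-01 10:00", "2024-01-01 10:30", "2024-01-01 12:00"]),
        "end_time":   pd.to_datetime(["2024-01-01 11:00", "2024-01-01 11:15", "2024-01-01 13:00"]),
    })
    overlapping = set()
    for i in range(len(jobs)):
        this_end = jobs.at[i, "end_time"]
        for j in range(i + 1, len(jobs)):
            if jobs.at[j, "start_time"] >= this_end:
                break                  # later jobs start even later, so stop checking
            overlapping.add(j)         # job j starts before job i ends
    print(sorted(overlapping))         # -> [1]: the 10:30 job overlaps the 10:00 job; 12:00 is kept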

def vehicle_type_by_quarter_probs(self):
 961    def vehicle_type_by_quarter_probs(self):
 962        """
 963
 964            Calculates the probability of a car/helicopter being allocated to
 965            a call based on the callsign group and quarter of the year
 966
 967            Quarter accounts for seasonal variation without being as affected by
 968
 969        """
 970        data = self.df.copy()
 971
 972        # We will use the ad-hoc unavailability to remove any instances where we already know one of
 973        # the vehicles to be recorded as offline
 974
 975        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
 976        # off for servicing if that data is available
 977        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
 978        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()
 979
 980        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
 981        data["vehicle"] = data["vehicle"].str.lower()
 982
 983        # Create a cross-join between data and ad_hoc
 984        data['key'] = 1
 985        ad_hoc['key'] = 1
 986        merged = data.merge(ad_hoc, on='key')
 987
 988        # Keep rows where inc_date falls within the offline period
 989        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]
 990
 991        # Filter out those rows from the original data
 992        filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')
 993
 994        # First, calculate overall props
 995        callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
 996
 997        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
 998        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
 999
1000        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")
1001
1002        # Then, redo by quarter
1003        callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count')
1004
1005        total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum')
1006        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1007
1008        callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+")

Calculates the probability of a car/helicopter being allocated to a call based on the callsign group and quarter of the year

Quarter accounts for seasonal variation without being as affected by

def vehicle_type_probs(self):
1010    def vehicle_type_probs(self):
1011        """
1012
1013            Calculates the probability of a car/helicopter being allocated to
1014            a call based on the callsign group
1015
1016        """
1017
1018        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')
1019
1020        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
1021        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
1022
1023        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+")

Calculates the probability of a car/helicopter being allocated to a call based on the callsign group

def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1025    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
1026        """
1027
1028            Calculates the probability of a specific HEMS result being allocated to
1029            a call based on the callsign group and vehicle type
1030
1031            TODO: These probability calculation functions could probably be refactored into a single
1032            function and just specify columns and output name
1033
1034        """
1035        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')
1036
1037        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
1038        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)
1039
1040        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode = "w+", index=False)

Calculates the probability of a specific HEMS result being allocated to a call based on the callsign group and vehicle type

TODO: These probability calculation functions could probably be refactored into a single function and just specify columns and output name
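Following the TODO, a single generic helper could replace most of the count-then-normalise functions in this class. The sketch below is hypothetical; the name and signature are assumptions, not part of the codebase:

    import pandas as pd

    def save_proportions(df: pd.DataFrame, group_cols: list, within_cols: list,
                         out_path: str, dp: int = 4) -> pd.DataFrame:
        """Count rows by group_cols, express counts as proportions within within_cols, save to CSV."""
        counts = df.groupby(group_cols).size().reset_index(name="count")
        totals = counts.groupby(within_cols)["count"].transform("sum")
        counts["proportion"] = round(counts["count"] / totals, dp)
        counts.to_csv(out_path, mode="w+", index=False)
        return counts

    # For example, hems_result_by_callsign_group_and_vehicle_type_probs() could become:
    # save_proportions(self.df,
    #                  ["hems_result", "callsign_group", "vehicle_type"],
    #                  ["callsign_group", "vehicle_type"],
    #                  "distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv")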

def school_holidays(self) -> None:
1132    def school_holidays(self) -> None:
1133        """
1134            Function to generate a CSV file containing school holiday
1135            start and end dates for a given year. The year range is determined
1136            by the submitted data (plus a year at the end of the study for good measure)
1137        """
1138
1139        min_date = self.df.inc_date.min()
1140        max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years))
1141
1142        u = Utils()
1143
1144        years_of_holidays_list = u.years_between(min_date, max_date)
1145
1146        sh = pd.DataFrame(columns=['year', 'start_date', 'end_date'])
1147
1148        for i, year in enumerate(years_of_holidays_list):
1149            tmp = u.calculate_term_holidays(year)
1150
1151            if i == 0:
1152                sh = tmp
1153            else:
1154                sh = pd.concat([sh, tmp])
1155
1156        sh.to_csv('actual_data/school_holidays.csv', index = False)

" Function to generate a CSV file containing schoole holiday start and end dates for a given year. The Year range is determined by the submitted data (plus a year at the end of the study for good measure)

def historical_jobs_per_day_per_callsign(self):
1161    def historical_jobs_per_day_per_callsign(self):
1162        df = self.df
1163
1164        df["date"] = pd.to_datetime(df["inc_date"]).dt.date
1165        all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index()
1166        all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True)
1167
1168        all_combinations = pd.DataFrame(
1169            list(itertools.product(df['date'].unique(), df['callsign'].unique())),
1170            columns=['date', 'callsign']
1171        ).dropna()
1172
1173        merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left')
1174        merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int)
1175
1176        all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"})
1177        all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False)
def historical_care_cat_counts(self):
1179    def historical_care_cat_counts(self):
1180        """
1181        Process historical incident data to categorize care types and compute hourly counts.
1182
1183        This method performs the following steps:
1184        - Converts incident dates to datetime format.
1185        - Extracts month start and hour from the incident date.
1186        - Categorizes each incident into care categories based on benefit flags and attendance.
1187        - Counts the number of incidents by hour and care category.
1188        - Outputs these counts to a CSV file.
1189        - Computes and writes the proportion of regular care jobs with a helicopter benefit
1190        (excluding those not attended by a DAA resource) to a text file.
1191
1192        Outputs:
1193        - CSV file: 'historical_data/historical_care_cat_counts.csv'
1194        - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
1195        """
1196
1197        df_historical = self.df
1198
1199        df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date'])
1200        # Extract the first day of the month and the hour of each incident
1201        df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01")
1202        df_historical['hour'] = df_historical.inc_date.dt.hour
1203
1204        conditions = [
1205            df_historical['cc_benefit'] == 'y',
1206            df_historical['ec_benefit'] == 'y',
1207            df_historical['helicopter_benefit'] == 'y',
1208            df_historical['callsign_group'] == 'Other'
1209        ]
1210
1211        choices = [
1212            'CC',
1213            'EC',
1214            'REG - helicopter benefit',
1215            'Unknown - DAA resource did not attend'
1216        ]
1217        # Assign care category to each record
1218        # If the case did not meet any of the criteria in 'conditions', it will default
1219        # to being labelled as a 'regular/REG' case (i.e there was no benefit recorded)
1220        df_historical['care_category'] = np.select(conditions, choices, default='REG')
1221
1222        # Count occurrences grouped by hour and care category
1223        historical_value_counts_by_hour = (
1224            df_historical.value_counts(["hour", "care_category"])
1225            .reset_index(name="count")
1226            )
1227        # Output to CSV for use in tests and visualisations
1228        (historical_value_counts_by_hour
1229         .sort_values(['hour', 'care_category'])
1230         .to_csv("historical_data/historical_care_cat_counts.csv"))
1231
1232        # Also output the % of regular (not cc/ec) jobs with a helicopter benefit
1233        # These are the regular jobs we will make an assumption follow different logic due to having an obvious expected
1234        # patient benefit of having a helicopter allocated to them that we will have to assume is apparent at the time
1235        # of the call being placed (such as the casualty being located in a remote location, or )
1236
1237        numerator = (historical_value_counts_by_hour[
1238            historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit"
1239            ]["count"].sum())
1240
1241        denominator = (historical_value_counts_by_hour[
1242            (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") |
1243            (historical_value_counts_by_hour["care_category"] == "REG")
1244            ]["count"].sum())
1245
1246        with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file:
1247            heli_benefit_file.write(json.dumps((numerator/denominator).round(4)))
1248
1249         # Count occurrences grouped by hour and care category
1250        historical_value_counts_by_hour_cc_ec = (
1251            df_historical.value_counts(["hour", "care_category", "helicopter_benefit"])
1252            .reset_index(name="count")
1253            )
1254
1255       # Output to CSV for use in tests and visualisations
1256        # (historical_value_counts_by_hour_cc_ec
1257        #  .sort_values(["hour", "care_category", "helicopter_benefit"])
1258        #  .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv"))
1259
1260
1261        numerator_cc = (
1262            historical_value_counts_by_hour_cc_ec[
1263                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") &
1264                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1265                ]["count"].sum())
1266
1267        denominator_cc = (
1268            historical_value_counts_by_hour_cc_ec[
1269                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC")
1270                ]["count"].sum())
1271
1272        with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file:
1273            heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4)))
1274
1275        numerator_ec = (
1276            historical_value_counts_by_hour_cc_ec[
1277                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") &
1278                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
1279                ]["count"].sum())
1280
1281        denominator_ec = (
1282            historical_value_counts_by_hour_cc_ec[
1283                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC")
1284                ]["count"].sum())
1285
1286        with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file:
1287            heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))

Process historical incident data to categorize care types and compute hourly counts.

This method performs the following steps:

  • Converts incident dates to datetime format.
  • Extracts month start and hour from the incident date.
  • Categorizes each incident into care categories based on benefit flags and attendance.
  • Counts the number of incidents by hour and care category.
  • Outputs these counts to a CSV file.
  • Computes and writes the proportion of regular care jobs with a helicopter benefit (excluding those not attended by a DAA resource) to a text file.

Outputs:

  • CSV file: 'historical_data/historical_care_cat_counts.csv'
  • Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
def historical_monthly_totals(self):
1291    def historical_monthly_totals(self):
1292        """
1293            Calculates monthly incident totals from provided dataset of historical data
1294        """
1295
1296        # Multiple resources can be sent to the same job.
1297        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
1298            .drop_duplicates(subset="inc_date", keep="first")
1299
1300        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down")
1301        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
1302        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)
1303
1304        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
1305                                stand_down_car=('stand_down_car', 'sum'),
1306                                stand_down_helicopter=('stand_down_helicopter', 'sum'),
1307                                total_jobs=('vehicle_type', 'size')
1308                            ).reset_index()
1309
1310        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)

Calculates monthly incident totals from provided dataset of historical data

def historical_monthly_totals_all_calls(self):
1312    def historical_monthly_totals_all_calls(self):
1313        """
1314            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1315        """
1316
1317        # Multiple resources can be sent to the same job.
1318        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()
1319
1320        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()
1321
1322        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)

Calculates monthly totals of all calls from the provided dataset of historical data

def historical_monthly_totals_by_callsign(self):
1324    def historical_monthly_totals_by_callsign(self):
1325        """
1326            Calculates monthly incident totals from provided dataset of historical data stratified by callsign
1327        """
1328
1329        # Multiple resources can be sent to the same job.
1330        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()
1331
1332        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()
1333
1334        #print(monthly_totals_df.head())
1335
1336        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1337
1338        #print(monthly_totals_pivot_df.head())
1339
1340        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)

Calculates monthly incident totals from provided dataset of historical data stratified by callsign

def historical_monthly_totals_by_hour_of_day(self):
1342    def historical_monthly_totals_by_hour_of_day(self):
1343        """
1344            Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
1345        """
1346
1347        # Multiple resources can be sent to the same job.
1348        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
1349            .drop_duplicates(subset="inc_date", keep="first")
1350
1351        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()
1352
1353        #print(monthly_totals_df.head())
1354
1355        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1356
1357        #print(monthly_totals_pivot_df.head())
1358
1359        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)

Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day

def historical_monthly_totals_by_day_of_week(self):
1361    def historical_monthly_totals_by_day_of_week(self):
1362        """
1363            Calculates number of incidents per month stratified by day of the week
1364        """
1365
1366        # Multiple resources can be sent to the same job.
1367        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
1368            .drop_duplicates(subset="inc_date", keep="first")
1369
1370        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()
1371
1372        #print(monthly_totals_df.head())
1373
1374        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)
1375
1376        #print(monthly_totals_pivot_df.head())
1377
1378        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)

Calculates number of incidents per month stratified by day of the week

def historical_median_time_of_activities_by_month_and_resource_type(self):
1380    def historical_median_time_of_activities_by_month_and_resource_type(self):
1381        """
1382            Calculate the median time for each of the job cycle phases stratified by month and vehicle type
1383        """
1384
1385        median_df = self.df[['first_day_of_month', 'time_allocation',
1386                             'time_mobile', 'time_to_scene', 'time_on_scene',
1387                             'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1388
1389        median_df['total_job_time'] = median_df[[
1390            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1391            'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True)
1392
1393        # Replacing zeros with NaN to exclude from median calculation
1394        # since if an HEMS result is Stood down en route, then time_on_scene would be zero and affect the median
1395        # median_df.replace(0, np.nan, inplace=True)
1396
1397        # Grouping by month and resource_type, calculating medians
1398        median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index()
1399
1400        pivot_data = median_times.pivot_table(
1401            index='first_day_of_month',
1402            columns='vehicle_type',
1403            values=['time_allocation', 'time_mobile', 'time_to_scene',
1404                    'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time']
1405        )
1406
1407        pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns]
1408        pivot_data = pivot_data.reset_index()
1409
1410        pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
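
The pivot_table call above yields a two-level column index of (measure, vehicle_type); the list comprehension flattens it into names such as median_<vehicle_type>_<measure>. A minimal sketch of that flattening step, using hypothetical vehicle types:

import pandas as pd

demo = pd.DataFrame({
    'first_day_of_month': ['2023-01-01', '2023-01-01'],
    'vehicle_type': ['car', 'helicopter'],   # hypothetical categories
    'time_on_scene': [28.0, 35.0],
})
pivoted = demo.pivot_table(index='first_day_of_month',
                           columns='vehicle_type',
                           values=['time_on_scene'])
# Columns are tuples such as ('time_on_scene', 'car'); flatten them as the method does:
pivoted.columns = [f"median_{col[1]}_{col[0]}" for col in pivoted.columns]
# -> ['median_car_time_on_scene', 'median_helicopter_time_on_scene']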

1412    def historical_monthly_resource_utilisation(self):
1413        """
1414            Calculates number of, and time spent on, incidents per month stratified by callsign
1415        """
1416
1417        # Multiple resources can be sent to the same job.
1418        monthly_df = self.df[[
1419            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
1420            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
1421            'time_to_clear']].copy()
1422
1423        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)
1424
1425        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
1426            .agg(n = ('callsign', 'size'), total_time = ('total_time', 'sum'))
1427
1428        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time'])
1429
1430        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in monthly_totals_pivot_df.columns]
1431        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()
1432
1433        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)

1435    def historical_daily_calls_breakdown(self):
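        """
            Calculates the number of calls received on each day of the historical dataset
            and summarises how many days saw each daily call total, saving both to CSV
        """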
1436
1437        df = self.df
1438        # Convert inc_date to date only (remove time)
1439        df['date'] = pd.to_datetime(df['inc_date']).dt.date
1440
1441        # Count number of calls per day
1442        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')
1443
1444        # Save the daily call counts with a 'day' index column
1445        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
1446        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
1447        calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)
1448
1449        # Count how many days had the same number of calls
1450        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
1451        calls_per_day_summary.columns = ['calls_in_day', 'days']
1452        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)
1454    def historical_missed_jobs(self):
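        """
            Counts incidents where a HEMS resource (helicopter or car) was available and sent
            versus those where no HEMS resource was available, stratified by month, by hour
            of day, and by quarter and hour, saving each breakdown to CSV
        """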
1455        df = self.df
1456        df["date"] = pd.to_datetime(df["inc_date"])
1457        df["hour"] = df["date"].dt.hour
1458        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
1459        df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x=="Other" else "HEMS (helo or car) available and sent")
1460        df["quarter"] = df["date"].dt.quarter
1461
1462        # By month
1463        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','month_start'])
1464        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)
1465
1466        # By hour
1467        count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','hour'])
1468        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)
1469
1470        # By quarter and hour
1471        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter','callsign_group_simplified','hour'])
1472        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
1474    def upper_allowable_time_bounds(self):
1475        """
1476            Calculates the maximum permissible time for each phase of an incident based on supplied historical data.
1477            This is currently set to 1.5x the upper quartile of the data distribution
1478        """
1479
1480        median_df = self.df[[
1481            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
1482            'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()
1483
1484        # Replacing zeros with NaN to exclude from median calculation
1485        # since if an HEMS result is Stood down en route, then time_on_scene
1486        # would be zero and affect the median
1487        median_df.replace(0, np.nan, inplace=True)
1488
1489        print(median_df.quantile(.75, numeric_only=True))
1490        # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
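
The write-out above is still commented out and only the upper quartiles are printed; the bound described in the docstring could be derived from them along these lines (a sketch only, with a hypothetical output path, not part of the current implementation):

upper_bounds = median_df.quantile(.75, numeric_only=True) * 1.5
# e.g. upper_bounds['time_on_scene'] would be the maximum allowable on-scene time
# upper_bounds.to_frame('max_time').to_csv('distribution_data/upper_allowable_time_bounds.csv')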

1492    def historical_job_durations_breakdown(self):
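        """
            Produces a long-format breakdown of the time spent in each job cycle phase,
            plus the total job duration, per incident and stratified by callsign and vehicle type
        """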
1493
1494        df = self.df
1495
1496        cols = [
1497            'callsign', 'vehicle_type',
1498            'time_allocation', 'time_mobile',
1499            'time_to_scene', 'time_on_scene',
1500            'time_to_hospital', 'time_to_clear'
1501        ]
1502        df2 = df[cols].copy()
1503
1504        # 2. Add a 1-based row identifier
1505        df2['job_identifier'] = range(1, len(df2) + 1)
1506
1507        # 3. Compute total_duration as the row-wise sum of the time columns
1508        time_cols = [
1509            'time_allocation', 'time_mobile',
1510            'time_to_scene', 'time_on_scene',
1511            'time_to_hospital', 'time_to_clear'
1512        ]
1513        df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True)
1514
1515        #print(df2.head())
1516
1517        # 4. Pivot (melt) to long format
1518        df_long = df2.melt(
1519            id_vars=['job_identifier', 'callsign', 'vehicle_type'],
1520            value_vars=time_cols + ['total_duration'],
1521            var_name='name',
1522            value_name='value'
1523        )
1524
1525        #print(df_long[df_long.job_identifier == 1])
1526
1527        # 5. Drop any rows where callsign or vehicle_type is missing
1528        df_long = df_long.dropna(subset=['callsign', 'vehicle_type'])
1529        df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True)
1530
1531        # 6. Write out to CSV
1532        df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False)
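
For a single job, the melt step turns the six phase columns plus total_duration into one row per measure while keeping the identifying columns. A minimal sketch with hypothetical values:

import pandas as pd

one_job = pd.DataFrame({
    'job_identifier': [1], 'callsign': ['H70'], 'vehicle_type': ['helicopter'],  # hypothetical values
    'time_allocation': [3.0], 'time_mobile': [5.0], 'total_duration': [8.0],
})
long_form = one_job.melt(id_vars=['job_identifier', 'callsign', 'vehicle_type'],
                         value_vars=['time_allocation', 'time_mobile', 'total_duration'],
                         var_name='name', value_name='value')
# long_form has three rows for job 1, one per measure, each retaining callsign and vehicle_type
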
1883    def ad_hoc_unavailability(self, period_start, period_end):
1884        """
1885        Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime.
1886
1887        Args:
1888            period_start (str/datetime): Start date for analysis period
1889            period_end (str/datetime): End date for analysis period
1891
1892        Returns:
1893            None (saves results to CSV file)
1894        """
1895        # Load and prepare ad-hoc downtime data
1896        adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
1897        adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
1898        # Load rota and callsign lookup data
1899        rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
1900        callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
1901        # Merge rota with callsign lookup to get registration numbers to allow matching with
1902        # ad-hoc data, which uses registrations
1903        full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign")
1904
1905        # Define the hour bands mapping
1906        HOUR_BANDS = {
1907            '00-05': (0, 6),   # 00:00 to 05:59
1908            '06-11': (6, 12),  # 06:00 to 11:59
1909            '12-17': (12, 18), # 12:00 to 17:59
1910            '18-23': (18, 24)  # 18:00 to 23:59
1911        }
1912
1913        # Create list of 6-hour bins
1914        bins = ['00-05', '06-11', '12-17', '18-23']
1915
1916        def is_summer(date_obj, summer_start_month=4, summer_end_month=9):
1917            """
1918            Determine if a date falls in summer months (April-September).
1919
1920            Args:
1921                date_obj: Date object to check
1922
1923            Returns:
1924                bool: True if date is in summer months
1925            """
1926            return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)]
1927
1928        def check_month_is_summer(month, summer_start_month=4, summer_end_month=9):
1929            """
1930            Determine whether a given month falls in the summer months (April-September).
1931
1932            Args:
1933                month: Integer month
1934
1935            Returns:
1936                str: 'summer' if the month is in summer months, 'winter' otherwise
1937            """
1938            return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter'
1939
1940        def get_band_for_hour(hour):
1941            """
1942            Return the 6-hour band name for a given hour (0-23).
1943
1944            Args:
1945                hour (int): Hour of day (0-23)
1946
1947            Returns:
1948                str or None: Band name or None if hour is invalid
1949            """
1950            for band_name, (start, end) in HOUR_BANDS.items():
1951                if start <= hour < end:
1952                    return band_name
1953            return None
1954
1955        def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end):
1956            """
1957            Calculate how many minutes of a shift fall within a specific time band.
1958
1959            Args:
1960                shift_start (float): Shift start time in hours (0-23.99)
1961                shift_end (float): Shift end time in hours (0-23.99)
1962                band_start (int): Band start hour
1963                band_end (int): Band end hour
1964
1965            Returns:
1966                int: Minutes of overlap between shift and band
1967            """
1968            # Find the overlap between shift and band
1969            overlap_start = max(shift_start, band_start)
1970            overlap_end = min(shift_end, band_end)
1971
1972            if overlap_start < overlap_end:
1973                return int((overlap_end - overlap_start) * 60)  # Convert to minutes
1974            return 0
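        # Worked example (illustrative): a shift running 07:00-19:00 checked against the
        # 06-11 band (band_start=6, band_end=12) overlaps from 07:00 to 12:00, so
        # calculate_minutes_in_band(7, 19, 6, 12) == int((12 - 7) * 60) == 300 minutes.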
1975
1976        # Create date range for analysis period
1977        date_range = pd.date_range(start=period_start,
1978                            end=period_end,
1979                            freq='D')
1980        daily_df = pd.DataFrame({'date': date_range})
1981        daily_df['date'] = pd.to_datetime(daily_df['date'])
1982
1983        # Create MultiIndex from date and bins combinations to get all date/time combinations
1984        multi_index = pd.MultiIndex.from_product(
1985            [daily_df['date'], bins],
1986            names=['date', 'six_hour_bin']
1987            )
1988
1989        # Convert MultiIndex to DataFrame and reset index
1990        daily_df = multi_index.to_frame(index=False).reset_index(drop=True)
1991        # Add quarter information for seasonal analysis
1992        daily_df["quarter"] = daily_df.date.dt.quarter
1993
1994        # Initialize availability columns for each aircraft registration
1995        # Each column will store minutes of scheduled time per time band
1996        for registration in full_rota_df['registration'].unique():
1997            daily_df[registration] = 0 # Initialize with 0 minutes
1998
1999        # Calculate scheduled availability for each date/time band combination
2000        for _, row in daily_df.iterrows():
2001            current_date = row['date']
2002            current_band = row['six_hour_bin']
2003            band_start, band_end = HOUR_BANDS[current_band]
2004
2005            is_current_date_summer = is_summer(current_date)
2006
2007            # Get the row index for updating the dataframe
2008            row_idx = daily_df[(daily_df['date'] == current_date) &
2009                            (daily_df['six_hour_bin'] == current_band)].index[0]
2010
2011            # Iterate through each resource's rota entry
2012            for _, rota_entry in full_rota_df.iterrows():
2013                registration = rota_entry['registration']
2014                # Select appropriate start/end times based on season
2015                start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start'
2016                end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end'
2017                start_hour = rota_entry[start_hour_col]
2018                end_hour = rota_entry[end_hour_col]
2019
2020                total_minutes_for_band = 0
2021
2022                if start_hour < end_hour:
2023                    # Shift within same day
2024                    total_minutes_for_band = calculate_minutes_in_band(
2025                        start_hour, end_hour, band_start, band_end
2026                    )
2027                elif start_hour > end_hour:
2028                    # Shift spans midnight - check both parts
2029
2030                    # Part 1: Today from start_hour to midnight
2031                    if band_end <= 24:  # This band is today
2032                        total_minutes_for_band += calculate_minutes_in_band(
2033                            start_hour, 24, band_start, band_end
2034                        )
2035
2036                    # Part 2: Tomorrow from midnight to end_hour
2037                    # Need to check if this band is for tomorrow
2038                    tomorrow = current_date + pd.Timedelta(days=1)
2039                    tomorrow_rows = daily_df[daily_df['date'] == tomorrow]
2040
2041                    if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values:
2042                        total_minutes_for_band += calculate_minutes_in_band(
2043                            0, end_hour, band_start, band_end
2044                        )
2045
2046                # Update the scheduled time for this aircraft in this time band
2047                daily_df.loc[row_idx, registration] += total_minutes_for_band
2048
2049        # Aggregate scheduled availability by quarter, time band, and registration
2050        available_time = (
2051            daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration")
2052            .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]]
2053            .sum().reset_index()
2054            )
2055
2056        def calculate_availability_row(row, rota_df, callsign_lookup_df):
2057            """
2058            Calculate downtime overlap with scheduled rota for a single downtime event.
2059
2060            Args:
2061                row: DataFrame row containing downtime information
2062                rota_df: DataFrame with rota schedules
2063                callsign_lookup_df: DataFrame mapping callsigns to registrations
2064
2065            Returns:
2066                dict: Dictionary containing processed availability data
2067            """
2068            # Extract downtime information
2069            registration = row['aircraft'].lower()
2070            downtime_start = pd.to_datetime(row['offline'], utc=True)
2071            downtime_end = pd.to_datetime(row['online'], utc=True)
2072            reason = row.get('reason', None)
2073
2074            # Determine which 6-hour bin this downtime starts in
2075            hour = downtime_start.hour
2076            if 0 <= hour <= 5:
2077                six_hour_bin = '00-05'
2078            elif 6 <= hour <= 11:
2079                six_hour_bin = '06-11'
2080            elif 12 <= hour <= 17:
2081                six_hour_bin = '12-17'
2082            else:
2083                six_hour_bin = '18-23'
2084
2085            quarter = downtime_start.quarter
2086
2087            # Find the callsign for this registration
2088            match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]
2089
2090            # No matching callsign found
2091            if match.empty:
2092                return {
2093                    'registration': registration,
2094                    'offline': downtime_start,
2095                    'online': downtime_end,
2096                    'six_hour_bin': six_hour_bin,
2097                    'quarter': quarter,
2098                    'total_offline': None,
2099                    'reason': reason
2100                }
2101
2102            callsign = match.iloc[0]['callsign']
2103            # Find rota entries for this callsign
2104            rota_rows = rota_df[rota_df['callsign'] == callsign]
2105            if rota_rows.empty:
2106                return {
2107                    'registration': registration,
2108                    'offline': downtime_start,
2109                    'online': downtime_end,
2110                    'six_hour_bin': six_hour_bin,
2111                    'quarter': quarter,
2112                    'total_offline': None,
2113                    'reason': reason
2114                }
2115
2116            # Determine season for appropriate rota times
2117            month = downtime_start.month
2118            season = check_month_is_summer(month)
2119
2120            total_overlap_minutes = 0
2121
2122            # Calculate overlap between downtime and scheduled rota times
2123            for _, rota in rota_rows.iterrows():
2124                start_hour = rota[f'{season}_start']
2125                end_hour = rota[f'{season}_end']
2126
2127                # Check overlap across multiple days (yesterday, today, tomorrow)
2128                # This handles shifts that span midnight
2129                for base_day in [downtime_start.normalize() - timedelta(days=1),
2130                                downtime_start.normalize(),
2131                                downtime_start.normalize() + timedelta(days=1)]:
2132
2133                    rota_start = base_day + timedelta(hours=start_hour)
2134                    rota_end = base_day + timedelta(hours=end_hour)
2135
2136                    # Handle shifts that cross midnight
2137                    if end_hour <= start_hour:
2138                        rota_end += timedelta(days=1)
2139                    # Calculate overlap between downtime and this rota period
2140                    overlap_start = max(downtime_start, rota_start)
2141                    overlap_end = min(downtime_end, rota_end)
2142
2143                    if overlap_end > overlap_start:
2144                        overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60
2145
2146                        total_overlap_minutes += overlap_minutes
2147
2148            return {
2149                'registration': registration,
2150                'offline': downtime_start,
2151                'online': downtime_end,
2152                'six_hour_bin': six_hour_bin,
2153                'quarter': quarter,
2154                'total_offline': total_overlap_minutes,
2155                'reason': reason
2156            }
2157
2158        # Process all ad-hoc downtime events
2159        results = adhoc_df.apply(
2160                        lambda row: calculate_availability_row(
2161                            row, rota_df, callsign_lookup_df),
2162                        axis=1
2163                    )
2164
2165        # Convert results to DataFrame and select relevant columns
2166        unavailability_minutes_df = (
2167            pd.DataFrame(results.tolist())
2168            [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]]
2169            )
2170
2171        # Aggregate offline time by registration, time band, quarter, and reason
2172        offline_durations = unavailability_minutes_df.groupby(
2173            ["registration", "six_hour_bin", "quarter", "reason"]
2174            )[["total_offline"]].sum().reset_index()
2175
2176        # Create complete combinations to ensure all possible categories are represented
2177        # This prevents missing data issues in the final output
2178        registrations = offline_durations['registration'].unique()
2179        six_hour_bins = offline_durations['six_hour_bin'].unique()
2180        quarters = offline_durations['quarter'].unique()
2181        reasons = offline_durations['reason'].unique()
2182
2183        # Generate all possible combinations
2184        all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons))
2185
2186        # Create complete dataframe with all combinations
2187        complete_df = pd.DataFrame(
2188            all_combinations,
2189            columns=['registration', 'six_hour_bin', 'quarter', 'reason']
2190            )
2191
2192        # Merge with original data to preserve existing values, fill missing with 0
2193        offline_durations = complete_df.merge(
2194            offline_durations,
2195            on=['registration', 'six_hour_bin', 'quarter', 'reason'],
2196            how='left'
2197            )
2198
2199        # Fill NaN values with 0 (no downtime for those combinations)
2200        offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0)
2201
2202        # Sort for better readability
2203        offline_durations = offline_durations.sort_values(
2204            ['registration', 'six_hour_bin', 'quarter', 'reason']
2205            ).reset_index(drop=True)
2206
2207        # Ensure consistent case for registration names
2208        available_time["registration"] = available_time["registration"].str.lower()
2209        offline_durations["registration"] = offline_durations["registration"].str.lower()
2210
2211        # Merge scheduled time with downtime data
2212        ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", "six_hour_bin"])
2213        ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"]
2214
2215        # Calculate availability probability (1 - sum of all unavailability probabilities)
2216        available_prop_df = ad_hoc.groupby(
2217            ["quarter", "six_hour_bin", "registration", "rota_time"]
2218            )[["probability"]].sum().reset_index()
2219
2220        available_prop_df["reason"] = "available"
2221        available_prop_df["probability"] = 1 - available_prop_df["probability"]
2222
2223        # Combine unavailability and availability data
2224        final_ad_hoc_df = (
2225            pd.concat([ad_hoc, available_prop_df])
2226            .sort_values(["quarter", "six_hour_bin", "registration", "reason"])
2227            )
2228
2229        # Handle cases where there's no scheduled time (set probability to NaN)
2230        final_ad_hoc_df["probability"] = final_ad_hoc_df.apply(
2231            lambda x: np.nan if x["rota_time"]==0 else x["probability"],
2232            axis=1
2233            )
2234
2235        final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5)
2236
2237        # Save results to CSV
2238        final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv(
2239            "distribution_data/ad_hoc_unavailability.csv", index=False
2240            )
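
The saved file holds, for each registration, quarter and six-hour band, the probability of each unavailability reason plus an 'available' row. A downstream sampler could draw a state from it along these lines (a sketch, assuming the column names written above and a hypothetical registration):

import numpy as np
import pandas as pd

probs = pd.read_csv("distribution_data/ad_hoc_unavailability.csv")
subset = probs[(probs["registration"] == "g-abcd") &
               (probs["quarter"] == 1) &
               (probs["six_hour_bin"] == "06-11")].dropna(subset=["probability"])
weights = subset["probability"] / subset["probability"].sum()   # re-normalise after rounding
rng = np.random.default_rng()
state = rng.choice(subset["reason"].to_numpy(), p=weights.to_numpy())  # 'available' or a downtime reason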

2243    def run_sim_on_historical_params(self):
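        """
            Resets the rota, callsign and servicing inputs to their historical defaults, runs
            the simulation over a comparable period, and saves summary outputs for comparison
            against the historical data
        """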
2244        # Ensure all rotas are using default values
2245        rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv")
2246        rota.to_csv("actual_data/HEMS_ROTA.csv", index=False)
2247
2248        callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv")
2249        callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False)
2250
2251        service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv")
2252        service_history.to_csv("actual_data/service_history.csv", index=False)
2253
2254        service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv")
2255        service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False)
2256
2257        print("Generating simulation results...")
2258        removeExistingResults()
2259
2260        total_runs = 30
2261        sim_years = 2
2262        sim_duration = 60 * 24 * 7 * 52 * sim_years
2263
2264        parallelProcessJoblib(
2265            total_runs=total_runs,
2266            sim_duration=sim_duration,
2267            warm_up_time=0,
2268            sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"),
2269            amb_data=False,
2270            print_debug_messages=True
2271        )
2272
2273        collateRunResults()
2274
2275        try:
2276            results_all_runs = pd.read_csv("data/run_results.csv")
2277            # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False)
2278
2279            # save data of counts of suboptimal care category sent
2280            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index()
2281
2282            counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index()
2283
2284            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv")
2285
2286            # save data of counts of suboptimal vehicle type sent
2287            counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index()
2288
2289            counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index()
2290
2291            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")
2292
2293
2294            # # Also run the model to get some base-case outputs
2295            # resource_requests = (
2296            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
2297            #     .copy()
2298            #     )
2299
2300            # resource_requests["care_cat"] = (
2301            #     resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y"
2302            #                             and x["care_cat"]=="REG" else x["care_cat"],
2303            #                             axis=1)
2304            #                             )
2305
2306            # missed_jobs_care_cat_summary = (
2307            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
2308            #     .sort_values(["care_cat", "time_type"])
2309            #     .copy()
2310            #     )
2311
2312            # missed_jobs_care_cat_summary["jobs_average"] = (
2313            #     missed_jobs_care_cat_summary["jobs"]/
2314            #     total_runs
2315            #     )
2316
2317            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
2318            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365)
2319            #     ).round(0)
2320
2321            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
2322                    results_all_runs=results_all_runs,
2323                    run_length_days = float(sim_years*365),
2324                    what="summary"
2325                    )
2326
2327            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")
2328
2329
2330            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
2331                    results_all_runs=results_all_runs,
2332                    run_length_days = float(sim_years*365),
2333                    what="breakdown"
2334                    )
2335
2336            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")
2337
2338        except FileNotFoundError:
2339            pass