distribution_fit_utils
from csv import QUOTE_ALL
import glob
import math
import os
import sys
import numpy as np
import pandas as pd
import json
import itertools
from fitter import Fitter, get_common_distributions
from datetime import timedelta
from utils import Utils
from des_parallel_process import parallelProcessJoblib, collateRunResults, removeExistingResults
from datetime import datetime
import visualisation._job_outcome_calculation as _job_outcome_calculation

os.chdir(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(os.path.dirname(os.path.realpath(__file__)))


class DistributionFitUtils():
    """
    # The DistributionFitUtils class

    This class will import a CSV, undertake some light
    wrangling and then determine the distributions and probabilities required
    for the Discrete Event Simulation.

    Example usage:
        my_data = DistributionFitUtils('data/my_data.csv')
        my_data.import_and_wrangle()

    """

    def __init__(self, file_path: str, calculate_school_holidays=False, school_holidays_years=0):

        self.file_path = file_path
        self.df = pd.DataFrame()

        # The number of additional years of school holidays that will be
        # calculated beyond the maximum date in the provided dataset
        self.school_holidays_years = school_holidays_years
        self.calculate_school_holidays = calculate_school_holidays

        self.times_to_fit = [
            {"hems_result": "Patient Treated but not conveyed by HEMS",
             "times_to_fit": ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
            {"hems_result": "Patient Conveyed by HEMS",
             "times_to_fit": ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
            {"hems_result": "Patient Conveyed by land with HEMS",
             "times_to_fit": ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 'time_to_clear']},
            {"hems_result": "Stand Down",
             "times_to_fit": ['time_allocation', 'time_mobile', 'time_to_clear']},
            {"hems_result": "Landed but no patient contact",
             "times_to_fit": ['time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_clear']},
        ]

        self.sim_tools_distr_plus = [
            "poisson",
            "bernoulli",
            "triang",
            "erlang",
            "weibull_min",
            "expon_weib",
            "betabinom",
            "pearson3",
            "cauchy",
            "chi2",
            "expon",
            "exponpow",
            "gamma",
            "lognorm",
            "norm",
            "powerlaw",
            "rayleigh",
            "uniform",
            "neg_binomial",
            "zip"
        ]
        # SR 16-04-2025 Have hardcoded the common distributions
        # to make setup for random number generation more robust
        # + get_common_distributions()

    def removeExistingResults(self, folder: str) -> None:
        """
        Removes results from previous fitting
        """

        matching_files = glob.glob(os.path.join(folder, "*.*"))

        print(matching_files)

        for file in matching_files:
            os.remove(file)


    def getBestFit(self, q_times, distr=get_common_distributions(), show_summary=False):
        """

        Convenience function for Fitter.
        Returns the model and parameters that are considered
        the 'best fit'.

        TODO: Determine how Fitter works this out

        """

        if(q_times.size > 0):
            if(len(distr) > 0):
                f = Fitter(q_times, timeout=60, distributions=distr)
            else:
                f = Fitter(q_times, timeout=60)
            f.fit()
            if show_summary == True:
                f.summary()
            return f.get_best()
        else:
            return {}
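
    # Illustrative note (not part of the pipeline): in recent versions of the
    # fitter package, Fitter.get_best() typically returns a single-entry dict
    # keyed by the winning distribution name and mapping to its fitted
    # parameters, so downstream code might unpack it roughly like this
    # (values invented):
    #
    #   best = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
    #   if best:  # an empty dict means there was no data to fit
    #       dist_name, params = next(iter(best.items()))
    #       # e.g. dist_name == "gamma", params == {"a": 1.8, "loc": 0.0, "scale": 6.2}
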
    def import_and_wrangle(self):
        """

        Function to import the CSV, add the additional columns that are required
        and then sequentially execute the other class functions to generate
        the probabilities and distributions required for the DES.

        TODO: Additional logic is required to check the imported CSV
        for missing values, incorrect column names etc.

        """

        try:
            df = pd.read_csv(self.file_path, quoting=QUOTE_ALL)
            self.df = df

            # Perhaps run some kind of checking function here.

        except FileNotFoundError:
            print("Cannot locate that file")

        # If everything is okay, crack on...
        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
        self.df['date_only'] = pd.to_datetime(df['inc_date'].dt.date)
        self.df['hour'] = self.df['inc_date'].dt.hour                # Hour of the day
        self.df['day_of_week'] = self.df['inc_date'].dt.day_name()   # Day of the week (e.g. Monday)
        self.df['month'] = self.df['inc_date'].dt.month
        self.df['quarter'] = self.df['inc_date'].dt.quarter
        self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]')

        # Replacing an upper quartile limit on job cycle times and
        # instead using a manually specified time frame.
        # This has the advantage of allowing manual amendment of the values
        # on the front-end
        # self.max_values_df = self.upper_allowable_time_bounds()
        self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv')
        # print(self.min_max_values_df)

        # This will be needed for other datasets, but has already been computed for DAA
        # self.df['ampds_card'] = self.df['ampds_code'].str[:2]

        self.removeExistingResults(Utils.HISTORICAL_FOLDER)
        self.removeExistingResults(Utils.DISTRIBUTION_FOLDER)

        # Get proportions of AMPDS card by hour of day
        self.hour_by_ampds_card_probs()

        # Determine 'best' distributions for time-based data
        self.activity_time_distributions()

        # Calculate probability patient will be female based on AMPDS card
        self.sex_by_ampds_card_probs()

        # Determine 'best' distributions for age ranges stratified by AMPDS card
        self.age_distributions()

        # Alternative approach to IA times.
        # Start with the probability of a call at a given hour, stratified by quarter
        self.hourly_arrival_by_qtr_probs()

        # Fit distributions and build empirical samples for the number of incidents per day, stratified by season and quarter
        self.incidents_per_day()
        self.incidents_per_day_samples()

        # Calculate probability of enhanced or critical care being required based on AMPDS card
        self.enhanced_or_critical_care_by_ampds_card_probs()

        # Calculate HEMS result
        self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs()

        # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day
        # self.callsign_group_by_ampds_card_and_hour_probs()
        # self.callsign_group_by_ampds_card_probs()
        # self.callsign_group_by_care_category()
        self.callsign_group()

        # Calculate probability of a particular vehicle type based on callsign group and month of year
        # self.vehicle_type_by_month_probs()
        self.vehicle_type_by_quarter_probs()
        # self.vehicle_type_probs()  # Similar to previous but without monthly stratification since ad hoc unavailability should account for this.

        # Calculate the patient outcome (conveyed, deceased, unknown)
        self.patient_outcome_by_care_category_and_quarter_probs()

        # ============= ARCHIVED CODE ================= #
        # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day
        # self.inter_arrival_times()
        # ============= END ARCHIVED CODE ================= #


        # ============= ARCHIVED CODE ================= #
        # Calculate probability of patient outcome
        # Note - this still needs to be run to support another one?
        # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign
        # self.pt_outcome_by_hems_result_and_care_category_probs()
        # ============= END ARCHIVED CODE ================= #

        # ============= ARCHIVED CODE ================= #
        # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs()
        # ============= END ARCHIVED CODE ================= #

        # ============= ARCHIVED CODE ================= #
        # Calculate probability of HEMS result being allocated to a job based on callsign and hour of day
        # self.hems_result_by_callsign_group_and_vehicle_type_probs()
        # ============= END ARCHIVED CODE ================= #

        # ============= ARCHIVED CODE ================= #
        # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit
        # self.hems_result_by_care_cat_and_helicopter_benefit_probs()
        # ============= END ARCHIVED CODE ================= #


        # Calculate school holidays since servicing schedules typically avoid these dates
        if self.calculate_school_holidays:
            self.school_holidays()

        # Calculate historical data
        self.historical_monthly_totals()
        self.historical_monthly_totals_by_callsign()
        self.historical_monthly_totals_by_day_of_week()
        self.historical_median_time_of_activities_by_month_and_resource_type()
        self.historical_monthly_totals_by_hour_of_day()
        self.historical_monthly_resource_utilisation()
        self.historical_monthly_totals_all_calls()
        self.historical_daily_calls_breakdown()
        self.historical_job_durations_breakdown()
        self.historical_missed_jobs()
        self.historical_jobs_per_day_per_callsign()
        self.historical_care_cat_counts()

        # Calculate proportions of ad hoc unavailability
        try:
            # self.ad_hoc_unavailability()
            self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31")
        except FileNotFoundError:
            print("Couldn't find ad-hoc unavailability file")

    def hour_by_ampds_card_probs(self):
        """

        Calculates the proportions of calls that are triaged with
        a specific AMPDS card, stratified by hour of day.

        TODO: Determine whether this should also be stratified by yearly quarter

        """
        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
        total_counts = category_counts.groupby('hour')['count'].transform('sum')
        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)

        # category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))

        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")


    def sex_by_ampds_card_probs(self):
        """

        Calculates the probability that the patient will be female,
        stratified by AMPDS card.

        """
        age_df = self.df
        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)

        category_counts[category_counts['sex'] == 'Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")
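
    # The *_probs methods in this class share one pattern: count rows per category
    # combination, then divide each count by its group total obtained with
    # transform('sum'). A made-up illustration of the idea (not real data):
    #
    #   hour  ampds_card  count  hour total  proportion
    #   0     07          2      8           0.25
    #   0     29          6      8           0.75
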
    def activity_time_distributions(self):
        """

        Determine the 'best' distribution for each phase of a call,
        i.e. allocation time, mobilisation time, time to scene,
        time on scene, travel time to hospital and handover, and time to clear.
        Not all times will apply to all cases, so the class 'times_to_fit'
        variable is a list of dictionaries containing the times to fit.

        The data is currently stratified by the HEMS result and vehicle type fields.

        """

        vehicle_type = self.df['vehicle_type'].dropna().unique()

        # We'll need to make sure that where a distribution is missing the time is set to 0 in the model.
        # Probably easier than complicated logic to determine what times should be available based on hems_result

        final_distr = []

        for row in self.times_to_fit:
            # print(row)
            for ttf in row['times_to_fit']:
                for vt in vehicle_type:
                    # print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is {vt}")

                    # This line might not be required if data quality is determined when importing the data
                    max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0]
                    min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0]

                    # print(f"Max time is {max_time} and Min time is {min_time}")

                    if ttf == 'time_on_scene':
                        # There is virtually no data for HEMS results other than patient conveyed,
                        # which is causing issues with fitting. For time on scene, we will
                        # use a simplified fitting that ignores hems_result as a category
                        fit_times = self.df[
                            (self.df.vehicle_type == vt) &
                            (self.df[ttf] >= min_time) &
                            (self.df[ttf] <= max_time)
                        ][ttf]
                    else:
                        fit_times = self.df[
                            (self.df.vehicle_type == vt) &
                            (self.df[ttf] >= min_time) &
                            (self.df[ttf] <= max_time) &
                            (self.df.hems_result == row['hems_result'])
                        ][ttf]
                    # print(fit_times[:10])
                    best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus)
                    # print(best_fit)

                    return_dict = {"vehicle_type": vt, "time_type": ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)}
                    # print(return_dict)
                    final_distr.append(return_dict)

        with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file:
            convert_file.write(json.dumps(final_distr))
            convert_file.close()


    def age_distributions(self):
        """

        Determine the 'best' distribution for age, stratified by
        AMPDS card

        """

        age_distr = []

        age_df = self.df[["age", "ampds_card"]].dropna()
        ampds_cards = age_df['ampds_card'].unique()
        print(ampds_cards)

        for card in ampds_cards:
            fit_ages = age_df[age_df['ampds_card'] == card]['age']
            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
            return_dict = {"ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
            age_distr.append(return_dict)

        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
            convert_file.write(json.dumps(age_distr))
            convert_file.close()


    # def inter_arrival_times(self):
    #     """

    #     Calculate the mean inter-arrival times for patients
    #     stratified by hour and yearly quarter

    #     """

    #     ia_df = self.df[['date_only', 'quarter', 'hour']].dropna()

    #     count_df = ia_df.groupby(['hour', 'date_only', 'quarter']).size().reset_index(name='n')

    #     ia_times_df = (
    #         count_df.groupby(['hour', 'quarter'])
    #         .agg(
    #             # max_arrivals_per_hour=('n', lambda x: round(60 / np.max(x), 3)),
    #             # min_arrivals_per_hour=('n', lambda x: round(60 / np.min(x), 3)),
    #             mean_cases=('n', lambda x: round(x.mean(), 1)),
    #             # sd_cases=('n', lambda x: round(x.std(), 3)),
    #             mean_iat=('n', lambda x: 60 / x.mean())
    #             # n=('n', 'size')
    #         )
    #         .reset_index()
    #     )
    #     # Additional column for NSPPThinning
    #     ia_times_df['t'] = ia_times_df['hour']
    #     ia_times_df['arrival_rate'] = ia_times_df['mean_iat'].apply(lambda x: 1/x)

    #     ia_times_df.to_csv('distribution_data/inter_arrival_times.csv', mode='w+')


    def incidents_per_day(self):
        """
        Fit distributions for the number of incidents per day using actual daily counts,
        applying year-based weighting to reflect trends (e.g. 2024 busier than 2023),
        stratified by season and quarter.
        """
        import math
        import json
        import numpy as np

        inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy()
        inc_df['year'] = inc_df['date_only'].dt.year

        # Daily incident counts
        inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
        inc_per_day['year'] = inc_per_day['date_only'].dt.year

        # Merge quarter and season from self.df
        date_info = self.df[['date_only', 'quarter']].drop_duplicates()

        if 'season' not in self.df.columns:
            date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")
        else:
            date_info = date_info.merge(
                self.df[['date_only', 'season']].drop_duplicates(),
                on='date_only',
                how='left'
            )

        inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left')

        # Weight settings - simple implementation rather than a biased-mean approach
        year_weights = {
            2023: 1.0,
            2024: 4.0  # give additional weight to 2024
        }

        # ========== SEASONAL DISTRIBUTIONS ==========
        jpd_distr = []

        for season in inc_per_day['season'].dropna().unique():
            filtered = inc_per_day[inc_per_day['season'] == season].copy()
            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)

            # Repeat rows proportionally by weight
            replicated = filtered.loc[
                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
            ]['jobs_per_day']

            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)

            jpd_distr.append({
                "season": season,
                "best_fit": best_fit,
                "min_n_per_day": int(replicated.min()),
                "max_n_per_day": int(replicated.max()),
                "mean_n_per_day": float(replicated.mean())
            })

        with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f:
            json.dump(jpd_distr, f)

        # ========== QUARTERLY DISTRIBUTIONS ==========
        jpd_qtr_distr = []

        for quarter in inc_per_day['quarter'].dropna().unique():
            filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy()
            filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0)

            replicated = filtered.loc[
                filtered.index.repeat((filtered['weight'] * 10).round().astype(int))
            ]['jobs_per_day']

            best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus)

            jpd_qtr_distr.append({
                "quarter": int(quarter),
                "best_fit": best_fit,
                "min_n_per_day": int(replicated.min()),
                "max_n_per_day": int(replicated.max()),
                "mean_n_per_day": float(replicated.mean())
            })

        with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f:
            json.dump(jpd_qtr_distr, f)
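
    # Worked illustration of the replication step above (numbers invented): with
    # year_weights = {2023: 1.0, 2024: 4.0} and the x10 scale factor, a day from
    # 2023 contributes 10 copies of its daily count to the array passed to the
    # fitter, while a day from 2024 contributes 40 copies, so 2024 pulls the
    # fitted distribution four times as strongly as 2023.
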
    def incidents_per_day_samples(self, weight_map=None, scale_factor=10):
        """
        Create weighted empirical samples of incidents per day by season and quarter.
        """

        inc_df = self.df[['date_only', 'quarter']].copy()
        inc_df['year'] = inc_df['date_only'].dt.year
        inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")

        # Get raw counts per day
        daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day')
        daily_counts['year'] = daily_counts['date_only'].dt.year

        # Merge back in season/quarter info
        meta_info = self.df[['date_only', 'quarter']].drop_duplicates()
        if 'season' in self.df.columns:
            meta_info = meta_info.merge(
                self.df[['date_only', 'season']].drop_duplicates(),
                on='date_only', how='left'
            )
        else:
            meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer")

        daily_counts = daily_counts.merge(meta_info, on='date_only', how='left')

        # Year weight map
        if weight_map is None:
            weight_map = {2023: 1.0, 2024: 1.1}

        # Compute weights
        daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0)

        # Storage
        empirical_samples = {}

        # Season-based
        for season in daily_counts['season'].dropna().unique():
            filtered = daily_counts[daily_counts['season'] == season].copy()
            repeated = filtered.loc[
                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
            ]['jobs_per_day'].tolist()

            empirical_samples[season] = repeated

        # Quarter-based
        for quarter in daily_counts['quarter'].dropna().unique():
            filtered = daily_counts[daily_counts['quarter'] == quarter].copy()
            repeated = filtered.loc[
                filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int))
            ]['jobs_per_day'].tolist()

            empirical_samples[f"Q{int(quarter)}"] = repeated

        with open("distribution_data/inc_per_day_samples.json", 'w') as f:
            json.dump(empirical_samples, f)


    def enhanced_or_critical_care_by_ampds_card_probs(self):
        """

        Calculates the probability of an enhanced or critical care resource being required
        based on the AMPDS card

        """

        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()

        def assign_care_category(row):
            # There are some rows with both EC and CC benefit selected;
            # this function will allocate them to only one category
            if row['cc_benefit'] == 'y':
                return 'CC'
            elif row['ec_benefit'] == 'y':
                return 'EC'
            else:
                return 'REG'

        ec_df['care_category'] = ec_df.apply(assign_care_category, axis=1)

        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')

        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)

        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode="w+", index=False)
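
    # Note on precedence (illustration only): assign_care_category checks cc_benefit
    # before ec_benefit, so a row flagged 'y' for both is counted once, as 'CC':
    #   {'cc_benefit': 'y', 'ec_benefit': 'y'}   -> 'CC'
    #   {'cc_benefit': None, 'ec_benefit': 'y'}  -> 'EC'
    #   {'cc_benefit': None, 'ec_benefit': None} -> 'REG'
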
    def patient_outcome_by_care_category_and_quarter_probs(self):
        """

        Calculates the probability of a patient outcome based on care category and yearly quarter

        """

        po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy()

        def assign_care_category(row):
            # There are some rows with both EC and CC benefit selected;
            # this function will allocate them to only one category
            if row['cc_benefit'] == 'y':
                return 'CC'
            elif row['ec_benefit'] == 'y':
                return 'EC'
            else:
                return 'REG'

        # There are some values that are missing e.g. CC quarter 1 Deceased.
        # I think we've had problems when trying to sample from this kind of thing before.
        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
        outcomes = po_df['pt_outcome'].unique()
        care_categories = ['CC', 'EC', 'REG']
        quarters = po_df['quarter'].unique()

        all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)),
                                        columns=['pt_outcome', 'care_category', 'quarter'])

        po_df['care_category'] = po_df.apply(assign_care_category, axis=1)

        po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count')

        merged = pd.merge(all_combinations, po_cat_counts,
                          on=['pt_outcome', 'care_category', 'quarter'],
                          how='left').fillna({'count': 0})
        merged['count'] = merged['count'].astype(int)

        total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum')
        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)

        merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode="w+", index=False)

    # def hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
    #     """

    #     Calculates the probability of a given HEMS result based on
    #     patient outcome, yearly quarter, vehicle type and callsign group

    #     """

    #     hr_df = self.df[['hems_result', 'quarter', 'pt_outcome', 'vehicle_type', 'callsign_group']].copy()

    #     # There are some values that are missing e.g. CC quarter 1 Deceased.
    #     # I think we've had problems when trying to sample from this kind of thing before.
    #     # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
    #     # hems_results = hr_df['hems_result'].unique()
    #     # outcomes = hr_df['pt_outcome'].unique()
    #     # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
    #     # callsign_group_categories = hr_df['callsign_group'].unique()
    #     # quarters = hr_df['quarter'].unique()

    #     # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
    #     #                                 columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])

    #     hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter']).size().reset_index(name='count')

    #     # merged = pd.merge(all_combinations, hr_cat_counts,
    #     #                   on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
    #     #                   how='left').fillna({'count': 0})
    #     # merged['count'] = merged['count'].astype(int)

    #     merged = hr_cat_counts

    #     total_counts = merged.groupby(['pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])['count'].transform('sum')
    #     merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)

    #     merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode="w+", index=False)

    def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self):
        """

        Calculates the probability of a given HEMS result based on
        - patient outcome
        - yearly quarter
        - time of day (7am - 6pm, 7pm - 6am)
        - vehicle type
        - and callsign group

        """
        self.df['inc_date'] = pd.to_datetime(self.df['inc_date'])
        self.df['hour'] = self.df['inc_date'].dt.hour
        self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night")

        hr_df = self.df[[
            'hems_result', 'quarter', 'pt_outcome',
            'vehicle_type', 'callsign_group', 'time_of_day'
        ]].copy()

        # There are some values that are missing e.g. CC quarter 1 Deceased.
        # I think we've had problems when trying to sample from this kind of thing before.
        # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0
        # hems_results = hr_df['hems_result'].unique()
        # outcomes = hr_df['pt_outcome'].unique()
        # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)]
        # callsign_group_categories = hr_df['callsign_group'].unique()
        # quarters = hr_df['quarter'].unique()

        # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)),
        #                                 columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'])

        hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome',
                                       'vehicle_type', 'callsign_group',
                                       'quarter', 'time_of_day']).size().reset_index(name='count')

        # merged = pd.merge(all_combinations, hr_cat_counts,
        #                   on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'],
        #                   how='left').fillna({'count': 0})
        # merged['count'] = merged['count'].astype(int)

        merged = hr_cat_counts

        total_counts = merged.groupby(['pt_outcome',
                                       'vehicle_type', 'callsign_group',
                                       'quarter', 'time_of_day'])['count'].transform('sum')
        merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3)

        merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode="w+", index=False)



    def hourly_arrival_by_qtr_probs(self):
        """

        Calculates the proportions of calls arriving in any given hour,
        stratified by yearly quarter

        """

        ia_df = self.df[['quarter', 'hour']].dropna()

        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)

        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")


    def callsign_group_by_ampds_card_and_hour_probs(self):
        """

        Calculates the probability of a specific callsign being allocated to
        a call based on the AMPDS card category and hour of day

        """
        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')

        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode="w+", index=False)

    def callsign_group_by_ampds_card_probs(self):
        """

        Calculates the probability of a specific callsign being allocated to
        a call based on the AMPDS card category

        """

        callsign_df = self.df[self.df['callsign_group'] != 'Other']

        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')
        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode="w+", index=False)

    def callsign_group(self):
        """
        Calculates the probability of a specific callsign group being allocated to
        a call
        """
        df = self.df.copy()

        # Convert time fields to numeric
        time_fields = [
            "time_allocation", "time_mobile", "time_to_scene",
            "time_on_scene", "time_to_hospital", "time_to_clear"
        ]
        for col in time_fields:
            df[col] = pd.to_numeric(df[col], errors="coerce")

        # Calculate total job duration in minutes
        df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)

        # Compute job start and end times
        df["start_time"] = df["inc_date"]
        df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")

        # Sort jobs by start time
        df = df.sort_values(by="start_time").reset_index(drop=True)

        # Set to hold indices of jobs that overlap (but only the later-starting ones)
        overlapping = set()

        # Check for overlaps
        for i in range(len(df)):
            this_end = df.at[i, "end_time"]

            # Compare only to later jobs
            for j in range(i + 1, len(df)):
                next_start = df.at[j, "start_time"]
                if next_start >= this_end:
                    break  # No more possible overlaps
                # If it starts before job i ends, it's overlapping
                overlapping.add(j)

        # Mark the overlaps in the dataframe
        df["overlaps"] = df.index.isin(overlapping)

        # Filter out overlapping jobs
        df_no_overlap = df[~df["overlaps"]]

        # We will use the ad-hoc unavailability to remove any instances where we already know one of
        # the vehicles to be recorded as offline

        data = df_no_overlap.copy()

        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
        # off for servicing, if that data is available
        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()

        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
        data["vehicle"] = data["vehicle"].str.lower()

        # Create a cross-join between data and ad_hoc
        data['key'] = 1
        ad_hoc['key'] = 1
        merged = data.merge(ad_hoc, on='key')

        # Keep rows where inc_date falls within the offline period
        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]

        # Filter out those rows from the original data
        df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')

        callsign_df = (
            df_no_overlap
            .assign(
                helicopter_benefit=np.select(
                    [
                        df_no_overlap["cc_benefit"] == "y",
                        df_no_overlap["ec_benefit"] == "y",
                        df_no_overlap["hems_result"].isin([
                            "Stand Down En Route",
                            "Landed but no patient contact",
                            "Stand Down Before Mobile"
                        ])
                    ],
                    ["y", "y", "n"],
                    default=df_no_overlap["helicopter_benefit"]
                ),
                care_category=np.select(
                    [
                        df_no_overlap["cc_benefit"] == "y",
                        df_no_overlap["ec_benefit"] == "y"
                    ],
                    ["CC", "EC"],
                    default="REG"
                )
            )
        )

        callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']

        callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count')

        total_counts = len(callsign_df)
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode="w+", index=False)
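
    # Rough illustration of the ad-hoc filtering above (dates invented): given an
    # ad_hoc row with offline=2023-06-01 10:00 and online=2023-06-01 14:00, any job
    # whose inc_date falls inside that window is dropped before the callsign-group
    # proportions are counted. As written, the filter matches on inc_date alone,
    # i.e. it does not additionally check which aircraft attended the job.
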
    # def callsign_group_by_care_category(self):
    #     """

    #     Calculates the probability of a specific callsign being allocated to
    #     a call based on the care category

    #     """
    #     df = self.df.copy()

    #     # Convert time fields to numeric
    #     time_fields = [
    #         "time_allocation", "time_mobile", "time_to_scene",
    #         "time_on_scene", "time_to_hospital", "time_to_clear"
    #     ]
    #     for col in time_fields:
    #         df[col] = pd.to_numeric(df[col], errors="coerce")

    #     # Calculate total job duration in minutes
    #     df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True)

    #     # Compute job start and end times
    #     df["start_time"] = df["inc_date"]
    #     df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m")

    #     # Sort jobs by start time
    #     df = df.sort_values(by="start_time").reset_index(drop=True)

    #     # Set to hold indices of jobs that overlap (but only the later-starting ones)
    #     overlapping = set()

    #     # Check for overlaps
    #     for i in range(len(df)):
    #         this_end = df.at[i, "end_time"]

    #         # Compare only to later jobs
    #         for j in range(i + 1, len(df)):
    #             next_start = df.at[j, "start_time"]
    #             if next_start >= this_end:
    #                 break  # No more possible overlaps
    #             # If it starts before job i ends, it's overlapping
    #             overlapping.add(j)

    #     # Mark the overlaps in the dataframe
    #     df["overlaps"] = df.index.isin(overlapping)

    #     # Filter out overlapping jobs
    #     df_no_overlap = df[~df["overlaps"]]


    #     callsign_df = (
    #         df_no_overlap
    #         .assign(
    #             helicopter_benefit=np.select(
    #                 [
    #                     df_no_overlap["cc_benefit"] == "y",
    #                     df_no_overlap["ec_benefit"] == "y",
    #                     df_no_overlap["hems_result"].isin([
    #                         "Stand Down En Route",
    #                         "Landed but no patient contact",
    #                         "Stand Down Before Mobile"
    #                     ])
    #                 ],
    #                 ["y", "y", "n"],
    #                 default=df_no_overlap["helicopter_benefit"]
    #             ),
    #             care_category=np.select(
    #                 [
    #                     df_no_overlap["cc_benefit"] == "y",
    #                     df_no_overlap["ec_benefit"] == "y"
    #                 ],
    #                 ["CC", "EC"],
    #                 default="REG"
    #             )
    #         )
    #     )

    #     callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other']

    #     callsign_counts = callsign_df.groupby(['care_category', 'callsign_group']).size().reset_index(name='count')

    #     total_counts = callsign_counts.groupby(['care_category'])['count'].transform('sum')
    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

    #     callsign_counts.to_csv('distribution_data/callsign_group_by_care_category_probs.csv', mode="w+", index=False)

    #========== ARCHIVED CODE ============ #
    # def vehicle_type_by_month_probs(self):
    #     """

    #     Calculates the probability of a car/helicopter being allocated to
    #     a call based on the callsign group and month of the year

    #     """
    #     callsign_counts = self.df.groupby(['callsign_group', 'month', 'vehicle_type']).size().reset_index(name='count')

    #     total_counts = callsign_counts.groupby(['callsign_group', 'month'])['count'].transform('sum')
    #     callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)
    #     callsign_counts.to_csv('distribution_data/vehicle_type_by_month_probs.csv', mode="w+")
    #========== END ARCHIVED CODE ============ #

    def vehicle_type_by_quarter_probs(self):
        """

        Calculates the probability of a car/helicopter being allocated to
        a call based on the callsign group and quarter of the year.

        Quarter accounts for seasonal variation without being as affected by
        the random variation seen in the smaller monthly samples.

        """
        data = self.df.copy()

        # We will use the ad-hoc unavailability to remove any instances where we already know one of
        # the vehicles to be recorded as offline

        # TODO: Ideally we'd also remove any instances where we know one of the helos to have been
        # off for servicing, if that data is available
        ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"])
        ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower()

        data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601")
        data["vehicle"] = data["vehicle"].str.lower()

        # Create a cross-join between data and ad_hoc
        data['key'] = 1
        ad_hoc['key'] = 1
        merged = data.merge(ad_hoc, on='key')

        # Keep rows where inc_date falls within the offline period
        overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])]

        # Filter out those rows from the original data
        filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key')

        # First, calculate overall proportions
        callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')

        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode="w+")

        # Then, redo by quarter
        callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count')

        total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode="w+")

    def vehicle_type_probs(self):
        """

        Calculates the probability of a car/helicopter being allocated to
        a call based on the callsign group

        """

        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')

        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode="w+")

    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
        """

        Calculates the probability of a specific HEMS result being allocated to
        a call based on the callsign group and hour of day

        TODO: These probability calculation functions could probably be refactored into a single
        function that just specifies the columns and output name

        """
        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')

        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)

        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode="w+", index=False)


    #========== ARCHIVED CODE ============ #
    # def hems_result_by_care_cat_and_helicopter_benefit_probs(self):
    #     """

    #     Calculates the probability of a specific HEMS result being allocated to
    #     a call based on the care category and whether a helicopter is beneficial

    #     """

    #     # Wrangle the data...trying numpy for a change

    #     hems_df = (
    #         self.df
    #         .assign(
    #             helicopter_benefit=np.select(
    #                 [
    #                     self.df["cc_benefit"] == "y",
    #                     self.df["ec_benefit"] == "y",
    #                     self.df["hems_result"].isin([
    #                         "Stand Down En Route",
    #                         "Landed but no patient contact",
    #                         "Stand Down Before Mobile"
    #                     ])
    #                 ],
    #                 ["y", "y", "n"],
    #                 default=self.df["helicopter_benefit"]
    #             ),
    #             care_cat=np.select(
    #                 [
    #                     self.df["cc_benefit"] == "y",
    #                     self.df["ec_benefit"] == "y"
    #                 ],
    #                 ["CC", "EC"],
    #                 default="REG"
    #             )
    #         )
    #     )

    #     hems_counts = hems_df.groupby(['hems_result', 'care_cat', 'helicopter_benefit']).size().reset_index(name='count')

    #     hems_counts['total'] = hems_counts.groupby(['care_cat', 'helicopter_benefit'])['count'].transform('sum')
    #     hems_counts['proportion'] = round(hems_counts['count'] / hems_counts['total'], 4)

    #     hems_counts.to_csv('distribution_data/hems_result_by_care_cat_and_helicopter_benefit_probs.csv', mode="w+", index=False)
    #========== END ARCHIVED CODE ============ #

    #========== ARCHIVED CODE ============ #
    # def pt_outcome_by_hems_result_and_care_category_probs(self):
    #     """

    #     Calculates the probability of a specific patient outcome based on HEMS result

    #     """

    #     hems_df = (
    #         self.df
    #         .assign(
    #             helicopter_benefit=np.select(
    #                 [
    #                     self.df["cc_benefit"] == "y",
    #                     self.df["ec_benefit"] == "y",
    #                     self.df["hems_result"].isin([
    #                         "Stand Down En Route",
    #                         "Landed but no patient contact",
    #                         "Stand Down Before Mobile"
    #                     ])
    #                 ],
    #                 ["y", "y", "n"],
    #                 default=self.df["helicopter_benefit"]
    #             ),
    #             care_category=np.select(
    #                 [
    #                     self.df["cc_benefit"] == "y",
    #                     self.df["ec_benefit"] == "y"
    #                 ],
    #                 ["CC", "EC"],
    #                 default="REG"
    #             )
    #         )
    #     )

    #     po_counts = hems_df.groupby(['pt_outcome', 'hems_result', 'care_category']).size().reset_index(name='count')

    #     po_counts['total'] = po_counts.groupby(['hems_result', 'care_category'])['count'].transform('sum')
    #     po_counts['proportion'] = round(po_counts['count'] / po_counts['total'], 4)

    #     po_counts.to_csv('distribution_data/pt_outcome_by_hems_result_and_care_category_probs.csv', mode="w+")
    #========== END ARCHIVED CODE ============ #

    def school_holidays(self) -> None:
        """
        Function to generate a CSV file containing school holiday
        start and end dates for a given year. The year range is determined
The Year range is determined 1135 by the submitted data (plus a year at the end of the study for good measure) 1136 """ 1137 1138 min_date = self.df.inc_date.min() 1139 max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years)) 1140 1141 u = Utils() 1142 1143 years_of_holidays_list = u.years_between(min_date, max_date) 1144 1145 sh = pd.DataFrame(columns=['year', 'start_date', 'end_date']) 1146 1147 for i, year in enumerate(years_of_holidays_list): 1148 tmp = u.calculate_term_holidays(year) 1149 1150 if i == 0: 1151 sh = tmp 1152 else: 1153 sh = pd.concat([sh, tmp]) 1154 1155 sh.to_csv('actual_data/school_holidays.csv', index = False) 1156 1157 1158# These functions are to wrangle historical data to provide comparison against the simulation outputs 1159 1160 def historical_jobs_per_day_per_callsign(self): 1161 df = self.df 1162 1163 df["date"] = pd.to_datetime(df["inc_date"]).dt.date 1164 all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index() 1165 all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True) 1166 1167 all_combinations = pd.DataFrame( 1168 list(itertools.product(df['date'].unique(), df['callsign'].unique())), 1169 columns=['date', 'callsign'] 1170 ).dropna() 1171 1172 merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left') 1173 merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int) 1174 1175 all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"}) 1176 all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False) 1177 1178 def historical_care_cat_counts(self): 1179 """ 1180 Process historical incident data to categorize care types and compute hourly counts. 1181 1182 This method performs the following steps: 1183 - Converts incident dates to datetime format. 1184 - Extracts month start and hour from the incident date. 1185 - Categorizes each incident into care categories based on benefit flags and attendance. 1186 - Counts the number of incidents by hour and care category. 1187 - Outputs these counts to a CSV file. 1188 - Computes and writes the proportion of regular care jobs with a helicopter benefit 1189 (excluding those not attended by a DAA resource) to a text file. 
    def historical_care_cat_counts(self):
        """
        Process historical incident data to categorize care types and compute hourly counts.

        This method performs the following steps:
        - Converts incident dates to datetime format.
        - Extracts month start and hour from the incident date.
        - Categorizes each incident into care categories based on benefit flags and attendance.
        - Counts the number of incidents by hour and care category.
        - Outputs these counts to a CSV file.
        - Computes and writes the proportion of regular care jobs with a helicopter benefit
          (excluding those not attended by a DAA resource) to a text file.

        Outputs:
            - CSV file: 'historical_data/historical_care_cat_counts.csv'
            - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
        """

        df_historical = self.df

        df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date'])
        # Extract the first day of the month and the hour of each incident
        df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01")
        df_historical['hour'] = df_historical.inc_date.dt.hour

        conditions = [
            df_historical['cc_benefit'] == 'y',
            df_historical['ec_benefit'] == 'y',
            df_historical['helicopter_benefit'] == 'y',
            df_historical['callsign_group'] == 'Other'
        ]

        choices = [
            'CC',
            'EC',
            'REG - helicopter benefit',
            'Unknown - DAA resource did not attend'
        ]
        # Assign a care category to each record.
        # If the case did not meet any of the criteria in 'conditions', it will default
        # to being labelled as a 'regular/REG' case (i.e. there was no benefit recorded)
        df_historical['care_category'] = np.select(conditions, choices, default='REG')

        # Count occurrences grouped by hour and care category
        historical_value_counts_by_hour = (
            df_historical.value_counts(["hour", "care_category"])
            .reset_index(name="count")
        )
        # Output to CSV for use in tests and visualisations
        (historical_value_counts_by_hour
         .sort_values(['hour', 'care_category'])
         .to_csv("historical_data/historical_care_cat_counts.csv"))

        # Also output the % of regular (not cc/ec) jobs with a helicopter benefit.
        # These are the regular jobs we will assume follow different logic, because they have an obvious expected
        # patient benefit from having a helicopter allocated to them that we assume is apparent at the time
        # of the call being placed (such as the casualty being located in a remote location)

        numerator = (historical_value_counts_by_hour[
            historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit"
        ]["count"].sum())

        denominator = (historical_value_counts_by_hour[
            (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") |
            (historical_value_counts_by_hour["care_category"] == "REG")
        ]["count"].sum())

        with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file:
            heli_benefit_file.write(json.dumps((numerator/denominator).round(4)))

        # Count occurrences grouped by hour, care category and helicopter benefit
        historical_value_counts_by_hour_cc_ec = (
            df_historical.value_counts(["hour", "care_category", "helicopter_benefit"])
            .reset_index(name="count")
        )

        # Output to CSV for use in tests and visualisations
        # (historical_value_counts_by_hour_cc_ec
        #  .sort_values(["hour", "care_category", "helicopter_benefit"])
        #  .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv"))


        numerator_cc = (
            historical_value_counts_by_hour_cc_ec[
                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") &
                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
            ]["count"].sum())

        denominator_cc = (
            historical_value_counts_by_hour_cc_ec[
                (historical_value_counts_by_hour_cc_ec["care_category"] == "CC")
            ]["count"].sum())

        with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file:
            heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4)))

        numerator_ec = (
            historical_value_counts_by_hour_cc_ec[
                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") &
                (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y")
            ]["count"].sum())

        denominator_ec = (
            historical_value_counts_by_hour_cc_ec[
                (historical_value_counts_by_hour_cc_ec["care_category"] == "EC")
            ]["count"].sum())

        with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file:
            heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))



    def historical_monthly_totals(self):
        """
        Calculates monthly incident totals from the provided dataset of historical data
        """

        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
            .drop_duplicates(subset="inc_date", keep="first")

        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down")
        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)

        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
            stand_down_car=('stand_down_car', 'sum'),
            stand_down_helicopter=('stand_down_helicopter', 'sum'),
            total_jobs=('vehicle_type', 'size')
        ).reset_index()

        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)

    def historical_monthly_totals_all_calls(self):
        """
        Calculates monthly incident totals for all calls from the provided dataset of historical data
        """

        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()

        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()

        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)

    def historical_monthly_totals_by_callsign(self):
        """
        Calculates monthly incident totals from the provided dataset of historical data, stratified by callsign
        """

        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()

        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()

        # print(monthly_totals_df.head())

        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)

        # print(monthly_totals_pivot_df.head())

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)
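
    # Shape of the pivot above (illustrative, with invented callsigns and counts):
    #   month       CC70  H70
    #   2023-01-01  12    31
    #   2023-02-01   9    28
    # i.e. one row per month and one column of monthly job counts per callsign.
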
    def historical_monthly_totals_by_hour_of_day(self):
        """
        Calculates monthly incident totals from the provided dataset of historical data, stratified by hour of the day
        """

        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
            .drop_duplicates(subset="inc_date", keep="first")

        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()

        # print(monthly_totals_df.head())

        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)

        # print(monthly_totals_pivot_df.head())

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)

    def historical_monthly_totals_by_day_of_week(self):
        """
        Calculates the number of incidents per month, stratified by day of the week
        """

        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
            .drop_duplicates(subset="inc_date", keep="first")

        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()

        # print(monthly_totals_df.head())

        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)

        # print(monthly_totals_pivot_df.head())

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)

    def historical_median_time_of_activities_by_month_and_resource_type(self):
        """
        Calculate the median time for each of the job cycle phases, stratified by month and vehicle type
        """

        median_df = self.df[['first_day_of_month', 'time_allocation',
                             'time_mobile', 'time_to_scene', 'time_on_scene',
                             'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()

        median_df['total_job_time'] = median_df[[
            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
            'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True)

        # Replacing zeros with NaN to exclude them from the median calculation,
        # since if a HEMS result is Stood Down En Route then time_on_scene would be zero and affect the median
        # median_df.replace(0, np.nan, inplace=True)

        # Grouping by month and resource type, calculating medians
        median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index()

        pivot_data = median_times.pivot_table(
            index='first_day_of_month',
            columns='vehicle_type',
            values=['time_allocation', 'time_mobile', 'time_to_scene',
                    'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time']
        )

        pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns]
        pivot_data = pivot_data.reset_index()

        pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)

    def historical_monthly_resource_utilisation(self):
        """
        Calculates the number of, and time spent on, incidents per month, stratified by callsign
        """

        # Multiple resources can be sent to the same job.
        monthly_df = self.df[[
            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
            'time_to_clear']].copy()

        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)

        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
            .agg(n=('callsign', 'size'), total_time=('total_time', 'sum'))

        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time'])

        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in monthly_totals_pivot_df.columns]
        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)

    def historical_daily_calls_breakdown(self):

        df = self.df
        # Convert inc_date to date only (remove time)
        df['date'] = pd.to_datetime(df['inc_date']).dt.date

        # Count number of calls per day
        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')

        # Save the daily call counts with a 'day' index column
        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
        calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)

        # Count how many days had the same number of calls
        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
        calls_per_day_summary.columns = ['calls_in_day', 'days']
        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)

    def historical_missed_jobs(self):
        df = self.df
        df["date"] = pd.to_datetime(df["inc_date"])
        df["hour"] = df["date"].dt.hour
        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
        df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x == "Other" else "HEMS (helo or car) available and sent")
        df["quarter"] = df["inc_date"].dt.quarter

        # By month
        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified', 'month_start'])
        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)

        # By hour
        count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified', 'hour'])
        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)

        # By quarter and hour
        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter', 'callsign_group_simplified', 'hour'])
        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
1476 This is currently set to 1.5x the upper quartile of the data distribution 1477 """ 1478 1479 median_df = self.df[[ 1480 'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 1481 'time_to_hospital', 'time_to_clear', 'vehicle_type']] 1482 1483 # Replacing zeros with NaN to exclude from median calculation 1484 # since if an HEMS result is Stood down en route, then time_on_scene 1485 # would be zero and affect the median 1486 median_df.replace(0, np.nan, inplace=True) 1487 1488 print(median_df.quantile(.75)) 1489 # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False) 1490 1491 def historical_job_durations_breakdown(self): 1492 1493 df = self.df 1494 1495 cols = [ 1496 'callsign', 'vehicle_type', 1497 'time_allocation', 'time_mobile', 1498 'time_to_scene', 'time_on_scene', 1499 'time_to_hospital', 'time_to_clear' 1500 ] 1501 df2 = df[cols].copy() 1502 1503 # 2. Add a 1-based row identifier 1504 df2['job_identifier'] = range(1, len(df2) + 1) 1505 1506 # 3. Compute total_duration as the row-wise sum of the time columns 1507 time_cols = [ 1508 'time_allocation', 'time_mobile', 1509 'time_to_scene', 'time_on_scene', 1510 'time_to_hospital', 'time_to_clear' 1511 ] 1512 df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True) 1513 1514 #print(df2.head()) 1515 1516 # 4. Pivot (melt) to long format 1517 df_long = df2.melt( 1518 id_vars=['job_identifier', 'callsign', 'vehicle_type'], 1519 value_vars=time_cols + ['total_duration'], 1520 var_name='name', 1521 value_name='value' 1522 ) 1523 1524 #print(df_long[df_long.job_identifier == 1]) 1525 1526 # 5. Drop any rows where callsign or vehicle_type is missing 1527 df_long = df_long.dropna(subset=['callsign', 'vehicle_type']) 1528 df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True) 1529 1530 # 6. Write out to CSV 1531 df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False) 1532 1533 # ========== ARCHIVED CODE - v1 of calcualte_availability_row ======================== # 1534 # def calculate_availability_row(self, row, rota_df, callsign_lookup_df, period_start, period_end): 1535 # """ 1536 # Compute downtime overlap, rota-based scheduled time, and proportion for a given row. 1537 # Returns data tagged with bin, quarter, and downtime reason. 
1538 # """ 1539 1540 # registration = row['aircraft'].lower() 1541 # downtime_start = pd.to_datetime(row['offline'], utc=True) 1542 # downtime_end = pd.to_datetime(row['online'], utc=True) 1543 # reason = row.get('reason', None) 1544 1545 # hour = downtime_start.hour 1546 # if 0 <= hour <= 5: 1547 # six_hour_bin = '00-05' 1548 # elif 6 <= hour <= 11: 1549 # six_hour_bin = '06-11' 1550 # elif 12 <= hour <= 17: 1551 # six_hour_bin = '12-17' 1552 # else: 1553 # six_hour_bin = '18-23' 1554 1555 # quarter = downtime_start.quarter 1556 1557 # # Match callsign 1558 # match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration] 1559 # if match.empty: 1560 # return { 1561 # 'registration': registration, 1562 # 'offline': downtime_start, 1563 # 'online': downtime_end, 1564 # 'six_hour_bin': six_hour_bin, 1565 # 'quarter': quarter, 1566 # 'total_offline': None, 1567 # 'scheduled_minutes': None, 1568 # 'reason': reason, 1569 # 'proportion': None 1570 # } 1571 # callsign = match.iloc[0]['callsign'] 1572 1573 # rota_rows = rota_df[rota_df['callsign'] == callsign] 1574 # if rota_rows.empty: 1575 # return { 1576 # 'registration': registration, 1577 # 'offline': downtime_start, 1578 # 'online': downtime_end, 1579 # 'six_hour_bin': six_hour_bin, 1580 # 'quarter': quarter, 1581 # 'total_offline': None, 1582 # 'scheduled_minutes': None, 1583 # 'reason': reason, 1584 # 'proportion': None 1585 # } 1586 1587 # # Clip evaluation window to downtime period 1588 # eval_start = max(downtime_start.normalize(), pd.to_datetime(period_start, utc=True)) 1589 # eval_end = min(downtime_end.normalize(), pd.to_datetime(period_end, utc=True)) 1590 1591 # total_scheduled_minutes = 0 1592 # total_overlap_minutes = 0 1593 1594 # current_day = eval_start 1595 # while current_day <= eval_end: 1596 # month = current_day.month 1597 # season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter' 1598 1599 # for _, rota in rota_rows.iterrows(): 1600 # start_hour = rota[f'{season}_start'] 1601 # end_hour = rota[f'{season}_end'] 1602 1603 # rota_start = current_day + timedelta(hours=start_hour) 1604 # rota_end = current_day + timedelta(hours=end_hour) 1605 # if end_hour <= start_hour: 1606 # rota_end += timedelta(days=1) 1607 1608 # # Count scheduled time regardless of overlap 1609 # scheduled_minutes = (rota_end - rota_start).total_seconds() / 60 1610 # total_scheduled_minutes += scheduled_minutes 1611 1612 # # Count overlap only if intersecting with downtime 1613 # overlap_start = max(downtime_start, rota_start) 1614 # overlap_end = min(downtime_end, rota_end) 1615 # if overlap_end > overlap_start: 1616 # overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60 1617 # total_overlap_minutes += overlap_minutes 1618 1619 # current_day += timedelta(days=1) 1620 1621 # if total_scheduled_minutes == 0: 1622 # proportion = None 1623 # else: 1624 # proportion = total_overlap_minutes / total_scheduled_minutes 1625 1626 # return { 1627 # 'registration': registration, 1628 # 'offline': downtime_start, 1629 # 'online': downtime_end, 1630 # 'six_hour_bin': six_hour_bin, 1631 # 'quarter': quarter, 1632 # 'total_offline': total_overlap_minutes, 1633 # 'scheduled_minutes': total_scheduled_minutes, 1634 # 'reason': reason, 1635 # 'proportion': proportion 1636 # } 1637 1638 1639 # ============= ARCHIVED CODE - v2 of calculate_availability_row ======================= # 1640 # def calculate_availability_row(self, row, rota_df, callsign_lookup_df): 1641 # """ 1642 # Compute downtime overlap, rota-based scheduled 
time, and proportion for a given row. 1643 # Returns data tagged with bin, quarter, and downtime reason. 1644 # """ 1645 1646 # registration = row['aircraft'].lower() 1647 # downtime_start = pd.to_datetime(row['offline'], utc=True) 1648 # downtime_end = pd.to_datetime(row['online'], utc=True) 1649 # reason = row.get('reason', None) 1650 1651 # hour = downtime_start.hour 1652 # if 0 <= hour <= 5: 1653 # six_hour_bin = '00-05' 1654 # elif 6 <= hour <= 11: 1655 # six_hour_bin = '06-11' 1656 # elif 12 <= hour <= 17: 1657 # six_hour_bin = '12-17' 1658 # else: 1659 # six_hour_bin = '18-23' 1660 1661 # quarter = downtime_start.quarter 1662 1663 # # Match callsign 1664 # match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration] 1665 # if match.empty: 1666 # return { 1667 # 'registration': registration, 1668 # 'offline': downtime_start, 1669 # 'online': downtime_end, 1670 # 'six_hour_bin': six_hour_bin, 1671 # 'quarter': quarter, 1672 # 'total_offline': None, 1673 # 'scheduled_minutes': None, 1674 # 'reason': reason, 1675 # 'proportion': None 1676 # } 1677 # callsign = match.iloc[0]['callsign'] 1678 1679 # rota_rows = rota_df[rota_df['callsign'] == callsign] 1680 # if rota_rows.empty: 1681 # return { 1682 # 'registration': registration, 1683 # 'offline': downtime_start, 1684 # 'online': downtime_end, 1685 # 'six_hour_bin': six_hour_bin, 1686 # 'quarter': quarter, 1687 # 'total_offline': None, 1688 # 'scheduled_minutes': None, 1689 # 'reason': reason, 1690 # 'proportion': None 1691 # } 1692 1693 # month = downtime_start.month 1694 # season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter' 1695 1696 # total_scheduled_minutes = 0 1697 # total_overlap_minutes = 0 1698 1699 # for _, rota in rota_rows.iterrows(): 1700 # start_hour = rota[f'{season}_start'] 1701 # end_hour = rota[f'{season}_end'] 1702 1703 # for base_day in [downtime_start.normalize() - timedelta(days=1), 1704 # downtime_start.normalize(), 1705 # downtime_start.normalize() + timedelta(days=1)]: 1706 1707 # rota_start = base_day + timedelta(hours=start_hour) 1708 # rota_end = base_day + timedelta(hours=end_hour) 1709 # if end_hour <= start_hour: 1710 # rota_end += timedelta(days=1) 1711 1712 # overlap_start = max(downtime_start, rota_start) 1713 # overlap_end = min(downtime_end, rota_end) 1714 1715 # if overlap_end > overlap_start: 1716 # scheduled_minutes = (rota_end - rota_start).total_seconds() / 60 1717 # overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60 1718 1719 # total_scheduled_minutes += scheduled_minutes 1720 # total_overlap_minutes += overlap_minutes 1721 1722 # if total_scheduled_minutes == 0: 1723 # proportion = None 1724 # else: 1725 # proportion = total_overlap_minutes / total_scheduled_minutes 1726 1727 # return { 1728 # 'registration': registration, 1729 # 'offline': downtime_start, 1730 # 'online': downtime_end, 1731 # 'six_hour_bin': six_hour_bin, 1732 # 'quarter': quarter, 1733 # 'total_offline': total_overlap_minutes, 1734 # 'scheduled_minutes': total_scheduled_minutes, 1735 # 'reason': reason, 1736 # 'proportion': proportion 1737 # } 1738 1739 # ================ ARCHIVED CODE - ad-hoc unavailability calculation ================ # 1740 # def ad_hoc_unavailability(self, period_start, period_end, include_debugging_cols=False): 1741 # """Process ad hoc unavailability records into a stratified probability table. 1742 1743 # Calculates the probability of ad hoc unavailability and availability based 1744 # on historical data. 
The data is stratified by aircraft registration, 1745 # six-hour time bins, and calendar quarters. It ensures all standard 1746 # reasons ('available', 'crew', 'weather', 'aircraft') are present for 1747 # each combination. It filters out any registration/quarter/bin pairings 1748 # with little scheduled time, marking their probabilities as blank. It also adds 1749 # a count of the ad-hoc unavailability events considered for each probability calculation 1750 # and the total scheduled time for the resource in that quarter/time period that was part of 1751 # the calculations. 1752 1753 # Returns 1754 # ------- 1755 # None 1756 # This function does not return a value but saves the calculated 1757 # probabilities to 'distribution_data/ad_hoc_unavailability.csv'. 1758 # The CSV file includes columns for registration, six_hour_bin, 1759 # quarter, reason, probability, and count. 1760 1761 # Notes 1762 # ----- 1763 # This function relies on the following input files: 1764 # - 'external_data/ad_hoc.csv': Contains ad hoc unavailability records. 1765 # - 'actual_data/HEMS_ROTA.csv': Contains rota information. 1766 # - 'actual_data/callsign_registration_lookup.csv': Maps callsigns to registrations. 1767 # It also depends on the 'calculate_availability_row' method within the 1768 # same class. Ensure that the 'distribution_data' directory exists. 1769 # """ 1770 # try: 1771 # # Load data 1772 # adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online']) 1773 # adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']] 1774 # rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv") 1775 # callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv") 1776 1777 # # Process each ad hoc record 1778 # results = adhoc_df.apply( 1779 # lambda row: self.calculate_availability_row(row, rota_df, callsign_lookup_df, period_start, period_end), 1780 # axis=1 1781 # ) 1782 # final_df = pd.DataFrame(results.tolist()) 1783 # final_df.to_csv("external_data/ad_hoc_intermediate.csv") 1784 1785 # # Check if final_df is empty before proceeding 1786 # if final_df.empty: 1787 # print("No ad-hoc data processed, skipping file generation.") 1788 # return 1789 1790 # # Define the full set of reasons expected 1791 # all_reasons = ['available', 'crew', 'weather', 'aircraft'] 1792 1793 # # --- Aggregate Data --- 1794 # # Calculate job count per registration, quarter, AND bin 1795 # unavailability_instance_counts = final_df.groupby(['registration', 'quarter', 'six_hour_bin']).size().reset_index(name='unavailability_instance_counts') 1796 1797 # # Downtime by bin + quarter + reason (only for unavailability reasons) 1798 # grouped = final_df[final_df['reason'].isin(['crew', 'weather', 'aircraft'])] 1799 # grouped = grouped.groupby(['registration', 'six_hour_bin', 'quarter', 'reason'])['total_offline'].sum().reset_index() 1800 1801 # # Scheduled time by bin + quarter 1802 # scheduled_totals = final_df.groupby(['registration', 'six_hour_bin', 'quarter'])['scheduled_minutes'].sum().reset_index() 1803 # scheduled_totals = scheduled_totals.rename(columns={'scheduled_minutes': 'total_scheduled'}) 1804 1805 # # Merge job counts into scheduled totals 1806 # scheduled_totals = pd.merge(scheduled_totals, unavailability_instance_counts, on=['registration', 'quarter', 'six_hour_bin'], how='left') 1807 1808 # # Calculate total downtime per bin + quarter (for 'available' calculation) 1809 # downtime_totals = grouped.groupby(['registration','six_hour_bin', 'quarter'])['total_offline'].sum().reset_index() 
1810 # downtime_totals = downtime_totals.rename(columns={'total_offline': 'total_downtime'}) 1811 1812 # # --- Create Full Grid --- 1813 # # Get all unique combinations of registration, quarter, and bin 1814 # unique_bins = scheduled_totals[['registration', 'quarter', 'six_hour_bin']].drop_duplicates() 1815 1816 # # Check for empty unique_bins 1817 # if unique_bins.empty: 1818 # print("No valid unique bins found, skipping file generation.") 1819 # return 1820 1821 # # Create the full grid by crossing unique bins with all reasons 1822 # full_grid = unique_bins.assign(key=1).merge(pd.DataFrame({'reason': all_reasons, 'key': 1}), on='key').drop('key', axis=1) 1823 1824 # # --- Merge Data into Full Grid --- 1825 # full_grid = pd.merge(full_grid, scheduled_totals, on=['registration', 'quarter', 'six_hour_bin'], how='left') 1826 # full_grid = pd.merge(full_grid, grouped, on=['registration', 'six_hour_bin', 'quarter', 'reason'], how='left') 1827 # full_grid = pd.merge(full_grid, downtime_totals, on=['registration', 'six_hour_bin', 'quarter'], how='left') 1828 1829 # # Fill NaNs created during merges 1830 # full_grid['total_offline'] = full_grid['total_offline'].fillna(0) 1831 # full_grid['total_downtime'] = full_grid['total_downtime'].fillna(0) 1832 # full_grid['unavailability_instance_counts'] = full_grid['unavailability_instance_counts'].fillna(0) # Fill job count with 0 for bins that might exist but have no jobs 1833 1834 # # --- Calculate Probabilities --- 1835 # # Suppress division by zero warnings - we handle these next 1836 # with np.errstate(divide='ignore', invalid='ignore'): 1837 # prob_avail = (full_grid['total_scheduled'] - full_grid['total_downtime']) / full_grid['total_scheduled'] 1838 # prob_unavail = full_grid['total_offline'] / full_grid['total_scheduled'] 1839 1840 # full_grid['probability'] = np.where( 1841 # full_grid['reason'] == 'available', 1842 # prob_avail, 1843 # prob_unavail 1844 # ) 1845 1846 # # Handle NaN/Inf from division by zero, set them to 0.0 for now. 
    #         full_grid['probability'] = full_grid['probability'].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    #         # --- Apply Threshold and Blanking ---
    #         # Condition for setting probability to blank
    #         condition_for_blank = (full_grid['total_scheduled'] < 60 * 2 * 30 * 3 ) # If less than 2 hours per day in time period rotad, exclude

    #         # Convert probability to object to allow blanks, then apply the condition
    #         full_grid['probability'] = full_grid['probability'].astype(object)
    #         full_grid.loc[condition_for_blank, 'probability'] = ''

    #         # --- Finalize and Save ---
    #         # Select and rename columns
    #         if include_debugging_cols:
    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability', 'unavailability_instance_counts', 'total_offline', 'total_scheduled']]
    #             final_prob_df.rename(columns={"total_offine":"total_minutes_offline_for_reason", "total_scheduled": "total_minutes_rotad_availability_in_quarter_and_time_period"})
    #         else:
    #             final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability']]
    #         # final_prob_df = final_prob_df.rename(columns={'job_count': 'count'})

    #         # Sort and save
    #         final_prob_df = final_prob_df.sort_values(by=['registration', 'quarter', 'six_hour_bin', 'reason']).reset_index(drop=True)
    #         final_prob_df.to_csv("distribution_data/ad_hoc_unavailability.csv", index=False)

    #         print("Ad-hoc unavailability probability table generated successfully.")

    #     except FileNotFoundError:
    #         print("Couldn't generate ad-hoc unavailability due to missing file(s). "
    #               "Please ensure 'external_data/ad_hoc.csv', "
    #               "'actual_data/HEMS_ROTA.csv', and "
    #               "'actual_data/callsign_registration_lookup.csv' exist.")
    #         pass
    #     except Exception as e:
    #         print(f"An error occurred during ad-hoc unavailability processing: {e}")
    #         pass

    def ad_hoc_unavailability(self, period_start, period_end):
        """
        Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime.

        Args:
            period_start (str/datetime): Start date for analysis period
            period_end (str/datetime): End date for analysis period

        Returns:
            None (saves results to CSV file)
        """
        # Load and prepare ad-hoc downtime data
        adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online'])
        adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']]
        # Load rota and callsign lookup data
        rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv")
        callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv")
        # Merge rota with callsign lookup to get registration numbers to allow matching with
        # ad-hoc data, which uses registrations
        full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign")

        # Define the hour bands mapping
        HOUR_BANDS = {
            '00-05': (0, 6),    # 00:00 to 05:59
            '06-11': (6, 12),   # 06:00 to 11:59
            '12-17': (12, 18),  # 12:00 to 17:59
            '18-23': (18, 24)   # 18:00 to 23:59
        }

        # Create list of 6-hour bins
        bins = ['00-05', '06-11', '12-17', '18-23']

        def is_summer(date_obj, summer_start_month=4, summer_end_month=9):
            """
            Determine if a date falls in summer months (April-September).

            Args:
                date_obj: Date object to check

            Returns:
                bool: True if date is in summer months
            """
            return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)]

        def check_month_is_summer(month, summer_start_month=4, summer_end_month=9):
            """
            Determine whether a given month falls in the summer period (April-September).

            Args:
                month: Integer month

            Returns:
                str: 'summer' if the month is in the summer period, 'winter' otherwise
            """
            return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter'

        def get_band_for_hour(hour):
            """
            Return the 6-hour band name for a given hour (0-23).

            Args:
                hour (int): Hour of day (0-23)

            Returns:
                str or None: Band name or None if hour is invalid
            """
            for band_name, (start, end) in HOUR_BANDS.items():
                if start <= hour < end:
                    return band_name
            return None

        def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end):
            """
            Calculate how many minutes of a shift fall within a specific time band.

            Args:
                shift_start (float): Shift start time in hours (0-23.99)
                shift_end (float): Shift end time in hours (0-23.99)
                band_start (int): Band start hour
                band_end (int): Band end hour

            Returns:
                int: Minutes of overlap between shift and band
            """
            # Find the overlap between shift and band
            overlap_start = max(shift_start, band_start)
            overlap_end = min(shift_end, band_end)

            if overlap_start < overlap_end:
                return int((overlap_end - overlap_start) * 60)  # Convert to minutes
            return 0

        # Create date range for analysis period
        date_range = pd.date_range(start=period_start,
                                   end=period_end,
                                   freq='D')
        daily_df = pd.DataFrame({'date': date_range})
        daily_df['date'] = pd.to_datetime(daily_df['date'])

        # Create MultiIndex from date and bins combinations to get all date/time combinations
        multi_index = pd.MultiIndex.from_product(
            [daily_df['date'], bins],
            names=['date', 'six_hour_bin']
        )

        # Convert MultiIndex to DataFrame and reset index
        daily_df = multi_index.to_frame(index=False).reset_index(drop=True)
        # Add quarter information for seasonal analysis
        daily_df["quarter"] = daily_df.date.dt.quarter

        # Initialize availability columns for each aircraft registration
        # Each column will store minutes of scheduled time per time band
        for registration in full_rota_df['registration'].unique():
            daily_df[registration] = 0  # Initialize with 0 minutes

        # Calculate scheduled availability for each date/time band combination
        for _, row in daily_df.iterrows():
            current_date = row['date']
            current_band = row['six_hour_bin']
            band_start, band_end = HOUR_BANDS[current_band]

            is_current_date_summer = is_summer(current_date)

            # Get the row index for updating the dataframe
            row_idx = daily_df[(daily_df['date'] == current_date) &
                               (daily_df['six_hour_bin'] == current_band)].index[0]

            # Iterate through each resource's rota entry
            for _, rota_entry in full_rota_df.iterrows():
                registration = rota_entry['registration']
                # Select appropriate start/end times based on season
                start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start'
                end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end'
                start_hour = rota_entry[start_hour_col]
                end_hour = rota_entry[end_hour_col]

                total_minutes_for_band = 0

                if start_hour < end_hour:
                    # Shift within same day
                    total_minutes_for_band = calculate_minutes_in_band(
                        start_hour, end_hour, band_start, band_end
                    )
                elif start_hour > end_hour:
                    # Shift spans midnight - check both parts

                    # Part 1: Today from start_hour to midnight
                    if band_end <= 24:  # This band is today
                        total_minutes_for_band += calculate_minutes_in_band(
                            start_hour, 24, band_start, band_end
                        )

                    # Part 2: Tomorrow from midnight to end_hour
                    # Need to check if this band is for tomorrow
                    tomorrow = current_date + pd.Timedelta(days=1)
                    tomorrow_rows = daily_df[daily_df['date'] == tomorrow]

                    if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values:
                        total_minutes_for_band += calculate_minutes_in_band(
                            0, end_hour, band_start, band_end
                        )

                # Update the scheduled time for this aircraft in this time band
                daily_df.loc[row_idx, registration] += total_minutes_for_band

        # Aggregate scheduled availability by quarter, time band, and registration
        available_time = (
            daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration")
            .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]]
            .sum().reset_index()
        )

        def calculate_availability_row(row, rota_df, callsign_lookup_df):
            """
            Calculate downtime overlap with scheduled rota for a single downtime event.

            Args:
                row: DataFrame row containing downtime information
                rota_df: DataFrame with rota schedules
                callsign_lookup_df: DataFrame mapping callsigns to registrations

            Returns:
                dict: Dictionary containing processed availability data
            """
            # Extract downtime information
            registration = row['aircraft'].lower()
            downtime_start = pd.to_datetime(row['offline'], utc=True)
            downtime_end = pd.to_datetime(row['online'], utc=True)
            reason = row.get('reason', None)

            # Determine which 6-hour bin this downtime starts in
            hour = downtime_start.hour
            if 0 <= hour <= 5:
                six_hour_bin = '00-05'
            elif 6 <= hour <= 11:
                six_hour_bin = '06-11'
            elif 12 <= hour <= 17:
                six_hour_bin = '12-17'
            else:
                six_hour_bin = '18-23'

            quarter = downtime_start.quarter

            # Find the callsign for this registration
            match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration]

            # No matching callsign found
            if match.empty:
                return {
                    'registration': registration,
                    'offline': downtime_start,
                    'online': downtime_end,
                    'six_hour_bin': six_hour_bin,
                    'quarter': quarter,
                    'total_offline': None,
                    'reason': reason
                }

            callsign = match.iloc[0]['callsign']
            # Find rota entries for this callsign
            rota_rows = rota_df[rota_df['callsign'] == callsign]
            if rota_rows.empty:
                return {
                    'registration': registration,
                    'offline': downtime_start,
                    'online': downtime_end,
                    'six_hour_bin': six_hour_bin,
                    'quarter': quarter,
                    'total_offline': None,
                    'reason': reason
                }

            # Determine season for appropriate rota times
            month = downtime_start.month
            season = check_month_is_summer(month)

            total_overlap_minutes = 0

            # Calculate overlap between downtime and scheduled rota times
            for _, rota in rota_rows.iterrows():
                start_hour = rota[f'{season}_start']
                end_hour = rota[f'{season}_end']

                # Check overlap across multiple days (yesterday, today, tomorrow)
                # This handles shifts that span midnight
                for base_day in [downtime_start.normalize() - timedelta(days=1),
                                 downtime_start.normalize(),
                                 downtime_start.normalize() + timedelta(days=1)]:

                    rota_start = base_day + timedelta(hours=start_hour)
                    rota_end = base_day + timedelta(hours=end_hour)

                    # Handle shifts that cross midnight
                    if end_hour <= start_hour:
                        rota_end += timedelta(days=1)
                    # Calculate overlap between downtime and this rota period
                    overlap_start = max(downtime_start, rota_start)
                    overlap_end = min(downtime_end, rota_end)

                    if overlap_end > overlap_start:
                        overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60

                        total_overlap_minutes += overlap_minutes

            return {
                'registration': registration,
                'offline': downtime_start,
                'online': downtime_end,
                'six_hour_bin': six_hour_bin,
                'quarter': quarter,
                'total_offline': total_overlap_minutes,
                'reason': reason
            }

        # Process all ad-hoc downtime events
        results = adhoc_df.apply(
            lambda row: calculate_availability_row(
                row, rota_df, callsign_lookup_df),
            axis=1
        )

        # Convert results to DataFrame and select relevant columns
        unavailability_minutes_df = (
            pd.DataFrame(results.tolist())
            [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]]
        )

        # Aggregate offline time by registration, time band, quarter, and reason
        offline_durations = unavailability_minutes_df.groupby(
            ["registration", "six_hour_bin", "quarter", "reason"]
        )[["total_offline"]].sum().reset_index()

        # Create complete combinations to ensure all possible categories are represented
        # This prevents missing data issues in the final output
        registrations = offline_durations['registration'].unique()
        six_hour_bins = offline_durations['six_hour_bin'].unique()
        quarters = offline_durations['quarter'].unique()
        reasons = offline_durations['reason'].unique()

        # Generate all possible combinations
        all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons))

        # Create complete dataframe with all combinations
        complete_df = pd.DataFrame(
            all_combinations,
            columns=['registration', 'six_hour_bin', 'quarter', 'reason']
        )

        # Merge with original data to preserve existing values, fill missing with 0
        offline_durations = complete_df.merge(
            offline_durations,
            on=['registration', 'six_hour_bin', 'quarter', 'reason'],
            how='left'
        )

        # Fill NaN values with 0 (no downtime for those combinations)
        offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0)

        # Sort for better readability
        offline_durations = offline_durations.sort_values(
            ['registration', 'six_hour_bin', 'quarter', 'reason']
        ).reset_index(drop=True)

        # Ensure consistent case for registration names
        available_time["registration"] = available_time["registration"].str.lower()
        offline_durations["registration"] = offline_durations["registration"].str.lower()

        # Merge scheduled time with downtime data
        ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", "six_hour_bin"])
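        # Illustrative example (hypothetical numbers): if a registration had 540 minutes offline
        # for 'weather' in quarter 1's 06-11 band against 16,380 rota'd minutes in that band,
        # the 'weather' probability calculated below would be 540 / 16380, i.e. roughly 0.033.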
"six_hour_bin"]) 2212 ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"] 2213 2214 # Calculate availability probability (1 - sum of all unavailability probabilities) 2215 available_prop_df = ad_hoc.groupby( 2216 ["quarter", "six_hour_bin", "registration", "rota_time"] 2217 )[["probability"]].sum().reset_index() 2218 2219 available_prop_df["reason"] = "available" 2220 available_prop_df["probability"] = 1 - available_prop_df["probability"] 2221 2222 # Combine unavailability and availability data 2223 final_ad_hoc_df = ( 2224 pd.concat([ad_hoc, available_prop_df]) 2225 .sort_values(["quarter", "six_hour_bin", "registration", "reason"]) 2226 ) 2227 2228 # Handle cases where there's no scheduled time (set probability to NaN) 2229 final_ad_hoc_df["probability"] = final_ad_hoc_df.apply( 2230 lambda x: np.nan if x["rota_time"]==0 else x["probability"], 2231 axis=1 2232 ) 2233 2234 final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5) 2235 2236 # Save results to CSV 2237 final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv( 2238 "distribution_data/ad_hoc_unavailability.csv", index=False 2239 ) 2240 2241 2242 def run_sim_on_historical_params(self): 2243 # Ensure all rotas are using default values 2244 rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv") 2245 rota.to_csv("actual_data/HEMS_ROTA.csv", index=False) 2246 2247 callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv") 2248 callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False) 2249 2250 service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv") 2251 service_history.to_csv("actual_data/service_history.csv", index=False) 2252 2253 service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv") 2254 service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False) 2255 2256 print("Generating simulation results...") 2257 removeExistingResults() 2258 2259 total_runs = 30 2260 sim_years = 2 2261 sim_duration = 60 * 24 * 7 * 52 * sim_years 2262 2263 parallelProcessJoblib( 2264 total_runs=total_runs, 2265 sim_duration=sim_duration, 2266 warm_up_time=0, 2267 sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"), 2268 amb_data=False, 2269 print_debug_messages=True 2270 ) 2271 2272 collateRunResults() 2273 2274 try: 2275 results_all_runs = pd.read_csv("data/run_results.csv") 2276 # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False) 2277 2278 # save data of counts of suboptimal care category sent 2279 counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index() 2280 2281 counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index() 2282 2283 counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv") 2284 2285 # save data of counts of suboptimal vehicle type sent 2286 counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index() 2287 2288 counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index() 2289 2290 
            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")


            # # Also run the model to get some base-case outputs
            # resource_requests = (
            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
            #     .copy()
            # )

            # resource_requests["care_cat"] = (
            #     resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y"
            #                             and x["care_cat"]=="REG" else x["care_cat"],
            #                             axis=1)
            # )

            # missed_jobs_care_cat_summary = (
            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
            #     .sort_values(["care_cat", "time_type"])
            #     .copy()
            # )

            # missed_jobs_care_cat_summary["jobs_average"] = (
            #     missed_jobs_care_cat_summary["jobs"]/
            #     total_runs
            # )

            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365)
            # ).round(0)

            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
                results_all_runs=results_all_runs,
                run_length_days=float(sim_years*365),
                what="summary"
            )

            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")


            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
                results_all_runs=results_all_runs,
                run_length_days=float(sim_years*365),
                what="breakdown"
            )

            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")

        except FileNotFoundError:
            pass

if __name__ == "__main__":
    from distribution_fit_utils import DistributionFitUtils
    test = DistributionFitUtils('external_data/clean_daa_import_missing_2023_2024.csv',
                                calculate_school_holidays=True)
    #test = DistributionFitUtils('external_data/clean_daa_import.csv')
    test.import_and_wrangle()
    test.run_sim_on_historical_params()

# Testing ----------
# python distribution_fit_utils.py
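
# Illustrative sketch (not part of the pipeline): the ad-hoc unavailability table can be
# regenerated on its own for a different analysis window, since ad_hoc_unavailability()
# reads its inputs from external_data/ad_hoc.csv, actual_data/HEMS_ROTA.csv and
# actual_data/callsign_registration_lookup.csv rather than from the wrangled incident data.
# The period dates below are placeholders; the input files must already exist.
#
# from distribution_fit_utils import DistributionFitUtils
# fitter = DistributionFitUtils('external_data/clean_daa_import_missing_2023_2024.csv')
# fitter.ad_hoc_unavailability(period_start="2023-01-01", period_end="2023-12-31")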
callsign_df[callsign_df['callsign_group'] != 'Other'] 852 853 callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count') 854 855 total_counts = len(callsign_df) 856 callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 857 858 callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False) 859 860 861 # def callsign_group_by_care_category(self): 862 # """ 863 864 # Calculates the probabilty of a specific callsign being allocated to 865 # a call based on the care category 866 867 # """ 868 # df = self.df.copy() 869 870 # # Convert time fields to numeric 871 # time_fields = [ 872 # "time_allocation", "time_mobile", "time_to_scene", 873 # "time_on_scene", "time_to_hospital", "time_to_clear" 874 # ] 875 # for col in time_fields: 876 # df[col] = pd.to_numeric(df[col], errors="coerce") 877 878 # # Calculate total job duration in minutes 879 # df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True) 880 881 # # Compute job start and end times 882 # df["start_time"] = df["inc_date"] 883 # df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m") 884 885 # # Sort jobs by start time 886 # df = df.sort_values(by="start_time").reset_index(drop=True) 887 888 # # Set to hold indices of jobs that overlap (but only the later-starting ones) 889 # overlapping = set() 890 891 # # Check for overlaps 892 # for i in range(len(df)): 893 # this_end = df.at[i, "end_time"] 894 895 # # Compare only to later jobs 896 # for j in range(i + 1, len(df)): 897 # next_start = df.at[j, "start_time"] 898 # if next_start >= this_end: 899 # break # No more possible overlaps 900 # # If it starts before i's job ends, it's overlapping 901 # overlapping.add(j) 902 903 # # Mark the overlaps in the dataframe 904 # df["overlaps"] = df.index.isin(overlapping) 905 906 # # Filter out overlapping jobs 907 # df_no_overlap = df[~df["overlaps"]] 908 909 910 # callsign_df = ( 911 # df_no_overlap 912 # .assign( 913 # helicopter_benefit=np.select( 914 # [ 915 # df_no_overlap["cc_benefit"] == "y", 916 # df_no_overlap["ec_benefit"] == "y", 917 # df_no_overlap["hems_result"].isin([ 918 # "Stand Down En Route", 919 # "Landed but no patient contact", 920 # "Stand Down Before Mobile" 921 # ]) 922 # ], 923 # ["y", "y", "n"], 924 # default=df_no_overlap["helicopter_benefit"] 925 # ), 926 # care_category=np.select( 927 # [ 928 # df_no_overlap["cc_benefit"] == "y", 929 # df_no_overlap["ec_benefit"] == "y" 930 # ], 931 # ["CC", "EC"], 932 # default="REG" 933 # ) 934 # ) 935 # ) 936 937 # callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other'] 938 939 # callsign_counts = callsign_df.groupby(['care_category', 'callsign_group']).size().reset_index(name='count') 940 941 # total_counts = callsign_counts.groupby(['care_category'])['count'].transform('sum') 942 # callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 943 944 # callsign_counts.to_csv('distribution_data/callsign_group_by_care_category_probs.csv', mode = "w+", index=False) 945 946 #========== ARCHIVED CODE ============ # 947 # def vehicle_type_by_month_probs(self): 948 # """ 949 950 # Calculates the probabilty of a car/helicopter being allocated to 951 # a call based on the callsign group and month of the year 952 953 # """ 954 # callsign_counts = self.df.groupby(['callsign_group', 'month', 'vehicle_type']).size().reset_index(name='count') 955 956 # total_counts = callsign_counts.groupby(['callsign_group', 
'month'])['count'].transform('sum') 957 # callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 958 959 # callsign_counts.to_csv('distribution_data/vehicle_type_by_month_probs.csv', mode = "w+") 960 #========== END ARCHIVED CODE ============ # 961 def vehicle_type_by_quarter_probs(self): 962 """ 963 964 Calculates the probability of a car/helicopter being allocated to 965 a call based on the callsign group and quarter of the year 966 967 Quarter accounts for seasonal variation without being as affected by month-to-month noise as a full monthly breakdown 968 969 """ 970 data = self.df.copy() 971 972 # We will use the ad-hoc unavailability to remove any instances where we already know one of 973 # the vehicles to be recorded as offline 974 975 # TODO: Ideally we'd also remove any instances where we know one of the helos to have been 976 # off for servicing if that data is available 977 ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"]) 978 ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower() 979 980 data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601") 981 data["vehicle"] = data["vehicle"].str.lower() 982 983 # Create a cross-join between data and ad_hoc 984 data['key'] = 1 985 ad_hoc['key'] = 1 986 merged = data.merge(ad_hoc, on='key') 987 988 # Keep rows where inc_date falls within the offline period 989 overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])] 990 991 # Filter out those rows from the original data 992 filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key') 993 994 # First, calculate overall proportions 995 callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count') 996 997 total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum') 998 callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 999 1000 callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+") 1001 1002 # Then, redo by quarter 1003 callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count') 1004 1005 total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum') 1006 callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 1007 1008 callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+") 1009 1010 def vehicle_type_probs(self): 1011 """ 1012 1013 Calculates the probability of a car/helicopter being allocated to 1014 a call based on the callsign group 1015 1016 """ 1017 1018 callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count') 1019 1020 total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum') 1021 callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 1022 1023 callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+") 1024 1025 def hems_result_by_callsign_group_and_vehicle_type_probs(self): 1026 """ 1027 1028 Calculates the probability of a specific HEMS result being allocated to 1029 a call based on the callsign group and vehicle type 1030 1031 TODO: These probability calculation functions could probably be refactored into a single 1032 function and just specify columns and output name 1033 1034 """ 1035 hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count') 1036 1037
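# The two lines that follow use the same pattern as the other *_probs helpers in this class:
# transform('sum') broadcasts each (callsign_group, vehicle_type) group total back onto its rows,
# so count / total gives the proportion of each hems_result within that group.
# Illustrative output shape (callsign group and values entirely hypothetical):
#   hems_result               callsign_group  vehicle_type  count  proportion
#   Patient Conveyed by HEMS  CS1             helicopter       60      0.6000
#   Stand Down                CS1             helicopter       40      0.4000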
total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum') 1038 hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4) 1039 1040 hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode = "w+", index=False) 1041 1042 1043 #========== ARCHIVED CODE ============ # 1044 # def hems_result_by_care_cat_and_helicopter_benefit_probs(self): 1045 # """ 1046 1047 # Calculates the probabilty of a specific HEMS result being allocated to 1048 # a call based on the care category amd whether a helicopter is beneficial 1049 1050 # """ 1051 1052 # # Wrangle the data...trying numpy for a change 1053 1054 # hems_df = ( 1055 # self.df 1056 # .assign( 1057 # helicopter_benefit=np.select( 1058 # [ 1059 # self.df["cc_benefit"] == "y", 1060 # self.df["ec_benefit"] == "y", 1061 # self.df["hems_result"].isin([ 1062 # "Stand Down En Route", 1063 # "Landed but no patient contact", 1064 # "Stand Down Before Mobile" 1065 # ]) 1066 # ], 1067 # ["y", "y", "n"], 1068 # default=self.df["helicopter_benefit"] 1069 # ), 1070 # care_cat=np.select( 1071 # [ 1072 # self.df["cc_benefit"] == "y", 1073 # self.df["ec_benefit"] == "y" 1074 # ], 1075 # ["CC", "EC"], 1076 # default="REG" 1077 # ) 1078 # ) 1079 # ) 1080 1081 # hems_counts = hems_df.groupby(['hems_result', 'care_cat', 'helicopter_benefit']).size().reset_index(name='count') 1082 1083 # hems_counts['total'] = hems_counts.groupby(['care_cat', 'helicopter_benefit'])['count'].transform('sum') 1084 # hems_counts['proportion'] = round(hems_counts['count'] / hems_counts['total'], 4) 1085 1086 # hems_counts.to_csv('distribution_data/hems_result_by_care_cat_and_helicopter_benefit_probs.csv', mode = "w+", index=False) 1087 #========== END ARCHIVED CODE ============ # 1088 1089 #========== ARCHIVED CODE ============ # 1090 # def pt_outcome_by_hems_result_and_care_category_probs(self): 1091 # """ 1092 1093 # Calculates the probabilty of a specific patient outcome based on HEMS result 1094 1095 # """ 1096 1097 # hems_df = ( 1098 # self.df 1099 # .assign( 1100 # helicopter_benefit=np.select( 1101 # [ 1102 # self.df["cc_benefit"] == "y", 1103 # self.df["ec_benefit"] == "y", 1104 # self.df["hems_result"].isin([ 1105 # "Stand Down En Route", 1106 # "Landed but no patient contact", 1107 # "Stand Down Before Mobile" 1108 # ]) 1109 # ], 1110 # ["y", "y", "n"], 1111 # default=self.df["helicopter_benefit"] 1112 # ), 1113 # care_category=np.select( 1114 # [ 1115 # self.df["cc_benefit"] == "y", 1116 # self.df["ec_benefit"] == "y" 1117 # ], 1118 # ["CC", "EC"], 1119 # default="REG" 1120 # ) 1121 # ) 1122 # ) 1123 1124 # po_counts = hems_df.groupby(['pt_outcome', 'hems_result', 'care_category']).size().reset_index(name='count') 1125 1126 # po_counts['total'] = po_counts.groupby(['hems_result', 'care_category'])['count'].transform('sum') 1127 # po_counts['proportion'] = round(po_counts['count'] / po_counts['total'], 4) 1128 1129 # po_counts.to_csv('distribution_data/pt_outcome_by_hems_result_and_care_category_probs.csv', mode = "w+") 1130 #========== END ARCHIVED CODE ============ # 1131 1132 def school_holidays(self) -> None: 1133 """" 1134 Function to generate a CSV file containing schoole holiday 1135 start and end dates for a given year. 
The Year range is determined 1136 by the submitted data (plus a year at the end of the study for good measure) 1137 """ 1138 1139 min_date = self.df.inc_date.min() 1140 max_date = self.df.inc_date.max() + timedelta(weeks = (52 * self.school_holidays_years)) 1141 1142 u = Utils() 1143 1144 years_of_holidays_list = u.years_between(min_date, max_date) 1145 1146 sh = pd.DataFrame(columns=['year', 'start_date', 'end_date']) 1147 1148 for i, year in enumerate(years_of_holidays_list): 1149 tmp = u.calculate_term_holidays(year) 1150 1151 if i == 0: 1152 sh = tmp 1153 else: 1154 sh = pd.concat([sh, tmp]) 1155 1156 sh.to_csv('actual_data/school_holidays.csv', index = False) 1157 1158 1159# These functions are to wrangle historical data to provide comparison against the simulation outputs 1160 1161 def historical_jobs_per_day_per_callsign(self): 1162 df = self.df 1163 1164 df["date"] = pd.to_datetime(df["inc_date"]).dt.date 1165 all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index() 1166 all_counts_hist.rename(columns={'job_id':'jobs_in_day'}, inplace=True) 1167 1168 all_combinations = pd.DataFrame( 1169 list(itertools.product(df['date'].unique(), df['callsign'].unique())), 1170 columns=['date', 'callsign'] 1171 ).dropna() 1172 1173 merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left') 1174 merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int) 1175 1176 all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date":"count"}) 1177 all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False) 1178 1179 def historical_care_cat_counts(self): 1180 """ 1181 Process historical incident data to categorize care types and compute hourly counts. 1182 1183 This method performs the following steps: 1184 - Converts incident dates to datetime format. 1185 - Extracts month start and hour from the incident date. 1186 - Categorizes each incident into care categories based on benefit flags and attendance. 1187 - Counts the number of incidents by hour and care category. 1188 - Outputs these counts to a CSV file. 1189 - Computes and writes the proportion of regular care jobs with a helicopter benefit 1190 (excluding those not attended by a DAA resource) to a text file. 
1191 1192 Outputs: 1193 - CSV file: 'historical_data/historical_care_cat_counts.csv' 1194 - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt' 1195 """ 1196 1197 df_historical = self.df 1198 1199 df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date']) 1200 # Extract the first day of the month and the hour of each incident 1201 df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01") 1202 df_historical['hour'] = df_historical.inc_date.dt.hour 1203 1204 conditions = [ 1205 df_historical['cc_benefit'] == 'y', 1206 df_historical['ec_benefit'] == 'y', 1207 df_historical['helicopter_benefit'] == 'y', 1208 df_historical['callsign_group'] == 'Other' 1209 ] 1210 1211 choices = [ 1212 'CC', 1213 'EC', 1214 'REG - helicopter benefit', 1215 'Unknown - DAA resource did not attend' 1216 ] 1217 # Assign care category to each record 1218 # If the case did not meet any of the criteria in 'conditions', it will default 1219 # to being labelled as a 'regular/REG' case (i.e. there was no benefit recorded) 1220 df_historical['care_category'] = np.select(conditions, choices, default='REG') 1221 1222 # Count occurrences grouped by hour and care category 1223 historical_value_counts_by_hour = ( 1224 df_historical.value_counts(["hour", "care_category"]) 1225 .reset_index(name="count") 1226 ) 1227 # Output to CSV for use in tests and visualisations 1228 (historical_value_counts_by_hour 1229 .sort_values(['hour', 'care_category']) 1230 .to_csv("historical_data/historical_care_cat_counts.csv")) 1231 1232 # Also output the % of regular (not cc/ec) jobs with a helicopter benefit 1233 # These are the regular jobs we assume follow different logic because they have an obvious expected 1234 # patient benefit from having a helicopter allocated, which we assume is apparent at the time 1235 # the call is placed (such as the casualty being located in a remote location) 1236 1237 numerator = (historical_value_counts_by_hour[ 1238 historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit" 1239 ]["count"].sum()) 1240 1241 denominator = (historical_value_counts_by_hour[ 1242 (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") | 1243 (historical_value_counts_by_hour["care_category"] == "REG") 1244 ]["count"].sum()) 1245 1246 with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file: 1247 heli_benefit_file.write(json.dumps((numerator/denominator).round(4))) 1248 1249 # Count occurrences grouped by hour, care category and helicopter benefit 1250 historical_value_counts_by_hour_cc_ec = ( 1251 df_historical.value_counts(["hour", "care_category", "helicopter_benefit"]) 1252 .reset_index(name="count") 1253 ) 1254 1255 # Output to CSV for use in tests and visualisations 1256 # (historical_value_counts_by_hour_cc_ec 1257 # .sort_values(["hour", "care_category", "helicopter_benefit"]) 1258 # .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv")) 1259 1260 1261 numerator_cc = ( 1262 historical_value_counts_by_hour_cc_ec[ 1263 (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") & 1264 (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y") 1265 ]["count"].sum()) 1266 1267 denominator_cc = ( 1268 historical_value_counts_by_hour_cc_ec[ 1269 (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") 1270 ]["count"].sum()) 1271 1272 with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file: 1273
heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4))) 1274 1275 numerator_ec = ( 1276 historical_value_counts_by_hour_cc_ec[ 1277 (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") & 1278 (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y") 1279 ]["count"].sum()) 1280 1281 denominator_ec = ( 1282 historical_value_counts_by_hour_cc_ec[ 1283 (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") 1284 ]["count"].sum()) 1285 1286 with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file: 1287 heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4))) 1288 1289 1290 1291 def historical_monthly_totals(self): 1292 """ 1293 Calculates monthly incident totals from provided dataset of historical data 1294 """ 1295 1296 # Multiple resources can be sent to the same job. 1297 monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\ 1298 .drop_duplicates(subset="inc_date", keep="first") 1299 1300 is_stand_down = monthly_df['hems_result'].str.contains("Stand Down") 1301 monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int) 1302 monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int) 1303 1304 monthly_totals_df = monthly_df.groupby('first_day_of_month').agg( 1305 stand_down_car=('stand_down_car', 'sum'), 1306 stand_down_helicopter=('stand_down_helicopter', 'sum'), 1307 total_jobs=('vehicle_type', 'size') 1308 ).reset_index() 1309 1310 monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False) 1311 1312 def historical_monthly_totals_all_calls(self): 1313 """ 1314 Calculates monthly incident totals across all calls from the provided dataset of historical data 1315 """ 1316 1317 # Multiple resources can be sent to the same job. 1318 monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna() 1319 1320 monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index() 1321 1322 monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False) 1323 1324 def historical_monthly_totals_by_callsign(self): 1325 """ 1326 Calculates monthly incident totals from provided dataset of historical data stratified by callsign 1327 """ 1328 1329 # Multiple resources can be sent to the same job. 1330 monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna() 1331 1332 monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index() 1333 1334 #print(monthly_totals_df.head()) 1335 1336 monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1) 1337 1338 #print(monthly_totals_pivot_df.head()) 1339 1340 monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False) 1341 1342 def historical_monthly_totals_by_hour_of_day(self): 1343 """ 1344 Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day 1345 """ 1346 1347 # Multiple resources can be sent to the same job.
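# Since each resource sent to a job has its own row, the selection below de-duplicates on
# inc_date so that every incident is counted only once per month/hour combination.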
1348 monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\ 1349 .drop_duplicates(subset="inc_date", keep="first") 1350 1351 monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index() 1352 1353 #print(monthly_totals_df.head()) 1354 1355 monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1) 1356 1357 #print(monthly_totals_pivot_df.head()) 1358 1359 monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False) 1360 1361 def historical_monthly_totals_by_day_of_week(self): 1362 """ 1363 Calculates number of incidents per month stratified by day of the week 1364 """ 1365 1366 # Multiple resources can be sent to the same job. 1367 monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\ 1368 .drop_duplicates(subset="inc_date", keep="first") 1369 1370 monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index() 1371 1372 #print(monthly_totals_df.head()) 1373 1374 monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1) 1375 1376 #print(monthly_totals_pivot_df.head()) 1377 1378 monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False) 1379 1380 def historical_median_time_of_activities_by_month_and_resource_type(self): 1381 """ 1382 Calculate the median time for each of the job cycle phases stratified by month and vehicle type 1383 """ 1384 1385 median_df = self.df[['first_day_of_month', 'time_allocation', 1386 'time_mobile', 'time_to_scene', 'time_on_scene', 1387 'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy() 1388 1389 median_df['total_job_time'] = median_df[[ 1390 'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 1391 'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True) 1392 1393 # Replacing zeros with NaN to exclude from median calculation 1394 # since if an HEMS result is Stood down en route, then time_on_scene would be zero and affect the median 1395 # median_df.replace(0, np.nan, inplace=True) 1396 1397 # Grouping by month and resource_type, calculating medians 1398 median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index() 1399 1400 pivot_data = median_times.pivot_table( 1401 index='first_day_of_month', 1402 columns='vehicle_type', 1403 values=['time_allocation', 'time_mobile', 'time_to_scene', 1404 'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time'] 1405 ) 1406 1407 pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns] 1408 pivot_data = pivot_data.reset_index() 1409 1410 pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False) 1411 1412 def historical_monthly_resource_utilisation(self): 1413 """ 1414 Calculates number of, and time spent on, incidents per month stratified by callsign 1415 """ 1416 1417 # Multiple resources can be sent to the same job. 
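# Here rows are not de-duplicated: utilisation is reported per callsign, so each resource
# attending a job contributes its own job count and time to its callsign's monthly total.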
1418 monthly_df = self.df[[ 1419 'inc_date', 'first_day_of_month', 'callsign', 'time_allocation', 1420 'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital', 1421 'time_to_clear']].copy() 1422 1423 monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1) 1424 1425 monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\ 1426 .agg(n = ('callsign', 'size'), total_time = ('total_time', 'sum')) 1427 1428 monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values=['n', 'total_time']) 1429 1430 monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in monthly_totals_pivot_df.columns] 1431 monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index() 1432 1433 monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False) 1434 1435 def historical_daily_calls_breakdown(self): 1436 1437 df = self.df 1438 # Convert inc_date to date only (remove time) 1439 df['date'] = pd.to_datetime(df['inc_date']).dt.date 1440 1441 # Count number of calls per day 1442 calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day') 1443 1444 # Save the daily call counts with a 'day' index column 1445 calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy() 1446 calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1)) 1447 calls_in_day_breakdown_with_day.drop(columns='date').to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False) 1448 1449 # Count how many days had the same number of calls 1450 calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index() 1451 calls_per_day_summary.columns = ['calls_in_day', 'days'] 1452 calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False) 1453 1454 def historical_missed_jobs(self): 1455 df = self.df 1456 df["date"] = pd.to_datetime(df["inc_date"]) 1457 df["hour"] = df["date"].dt.hour 1458 df["month_start"] = df["date"].dt.strftime("%Y-%m-01") 1459 df["callsign_group_simplified"] = df["callsign_group"].apply(lambda x: "No HEMS available" if x=="Other" else "HEMS (helo or car) available and sent") 1460 df["quarter"] = df["inc_date"].dt.quarter 1461 1462 # By month 1463 count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','month_start']) 1464 count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False) 1465 1466 # By hour 1467 count_df = df[["callsign_group_simplified", "hour"]].value_counts().reset_index(name="count").sort_values(['callsign_group_simplified','hour']) 1468 count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False) 1469 1470 # By quarter and hour 1471 count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts().reset_index(name="count").sort_values(['quarter','callsign_group_simplified','hour']) 1472 count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False) 1473 1474 def upper_allowable_time_bounds(self): 1475 """ 1476 Calculates the maximum permissable time for each phase on an incident based on supplied historical data. 
1477 This is currently set to 1.5x the upper quartile of the data distribution 1478 """ 1479 1480 median_df = self.df[[ 1481 'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 1482 'time_to_hospital', 'time_to_clear', 'vehicle_type']] 1483 1484 # Replacing zeros with NaN to exclude from median calculation 1485 # since if an HEMS result is Stood down en route, then time_on_scene 1486 # would be zero and affect the median 1487 median_df.replace(0, np.nan, inplace=True) 1488 1489 print(median_df.quantile(.75)) 1490 # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False) 1491 1492 def historical_job_durations_breakdown(self): 1493 1494 df = self.df 1495 1496 cols = [ 1497 'callsign', 'vehicle_type', 1498 'time_allocation', 'time_mobile', 1499 'time_to_scene', 'time_on_scene', 1500 'time_to_hospital', 'time_to_clear' 1501 ] 1502 df2 = df[cols].copy() 1503 1504 # 2. Add a 1-based row identifier 1505 df2['job_identifier'] = range(1, len(df2) + 1) 1506 1507 # 3. Compute total_duration as the row-wise sum of the time columns 1508 time_cols = [ 1509 'time_allocation', 'time_mobile', 1510 'time_to_scene', 'time_on_scene', 1511 'time_to_hospital', 'time_to_clear' 1512 ] 1513 df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True) 1514 1515 #print(df2.head()) 1516 1517 # 4. Pivot (melt) to long format 1518 df_long = df2.melt( 1519 id_vars=['job_identifier', 'callsign', 'vehicle_type'], 1520 value_vars=time_cols + ['total_duration'], 1521 var_name='name', 1522 value_name='value' 1523 ) 1524 1525 #print(df_long[df_long.job_identifier == 1]) 1526 1527 # 5. Drop any rows where callsign or vehicle_type is missing 1528 df_long = df_long.dropna(subset=['callsign', 'vehicle_type']) 1529 df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True) 1530 1531 # 6. Write out to CSV 1532 df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False) 1533 1534 # ========== ARCHIVED CODE - v1 of calcualte_availability_row ======================== # 1535 # def calculate_availability_row(self, row, rota_df, callsign_lookup_df, period_start, period_end): 1536 # """ 1537 # Compute downtime overlap, rota-based scheduled time, and proportion for a given row. 1538 # Returns data tagged with bin, quarter, and downtime reason. 
1539 # """ 1540 1541 # registration = row['aircraft'].lower() 1542 # downtime_start = pd.to_datetime(row['offline'], utc=True) 1543 # downtime_end = pd.to_datetime(row['online'], utc=True) 1544 # reason = row.get('reason', None) 1545 1546 # hour = downtime_start.hour 1547 # if 0 <= hour <= 5: 1548 # six_hour_bin = '00-05' 1549 # elif 6 <= hour <= 11: 1550 # six_hour_bin = '06-11' 1551 # elif 12 <= hour <= 17: 1552 # six_hour_bin = '12-17' 1553 # else: 1554 # six_hour_bin = '18-23' 1555 1556 # quarter = downtime_start.quarter 1557 1558 # # Match callsign 1559 # match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration] 1560 # if match.empty: 1561 # return { 1562 # 'registration': registration, 1563 # 'offline': downtime_start, 1564 # 'online': downtime_end, 1565 # 'six_hour_bin': six_hour_bin, 1566 # 'quarter': quarter, 1567 # 'total_offline': None, 1568 # 'scheduled_minutes': None, 1569 # 'reason': reason, 1570 # 'proportion': None 1571 # } 1572 # callsign = match.iloc[0]['callsign'] 1573 1574 # rota_rows = rota_df[rota_df['callsign'] == callsign] 1575 # if rota_rows.empty: 1576 # return { 1577 # 'registration': registration, 1578 # 'offline': downtime_start, 1579 # 'online': downtime_end, 1580 # 'six_hour_bin': six_hour_bin, 1581 # 'quarter': quarter, 1582 # 'total_offline': None, 1583 # 'scheduled_minutes': None, 1584 # 'reason': reason, 1585 # 'proportion': None 1586 # } 1587 1588 # # Clip evaluation window to downtime period 1589 # eval_start = max(downtime_start.normalize(), pd.to_datetime(period_start, utc=True)) 1590 # eval_end = min(downtime_end.normalize(), pd.to_datetime(period_end, utc=True)) 1591 1592 # total_scheduled_minutes = 0 1593 # total_overlap_minutes = 0 1594 1595 # current_day = eval_start 1596 # while current_day <= eval_end: 1597 # month = current_day.month 1598 # season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter' 1599 1600 # for _, rota in rota_rows.iterrows(): 1601 # start_hour = rota[f'{season}_start'] 1602 # end_hour = rota[f'{season}_end'] 1603 1604 # rota_start = current_day + timedelta(hours=start_hour) 1605 # rota_end = current_day + timedelta(hours=end_hour) 1606 # if end_hour <= start_hour: 1607 # rota_end += timedelta(days=1) 1608 1609 # # Count scheduled time regardless of overlap 1610 # scheduled_minutes = (rota_end - rota_start).total_seconds() / 60 1611 # total_scheduled_minutes += scheduled_minutes 1612 1613 # # Count overlap only if intersecting with downtime 1614 # overlap_start = max(downtime_start, rota_start) 1615 # overlap_end = min(downtime_end, rota_end) 1616 # if overlap_end > overlap_start: 1617 # overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60 1618 # total_overlap_minutes += overlap_minutes 1619 1620 # current_day += timedelta(days=1) 1621 1622 # if total_scheduled_minutes == 0: 1623 # proportion = None 1624 # else: 1625 # proportion = total_overlap_minutes / total_scheduled_minutes 1626 1627 # return { 1628 # 'registration': registration, 1629 # 'offline': downtime_start, 1630 # 'online': downtime_end, 1631 # 'six_hour_bin': six_hour_bin, 1632 # 'quarter': quarter, 1633 # 'total_offline': total_overlap_minutes, 1634 # 'scheduled_minutes': total_scheduled_minutes, 1635 # 'reason': reason, 1636 # 'proportion': proportion 1637 # } 1638 1639 1640 # ============= ARCHIVED CODE - v2 of calculate_availability_row ======================= # 1641 # def calculate_availability_row(self, row, rota_df, callsign_lookup_df): 1642 # """ 1643 # Compute downtime overlap, rota-based scheduled 
time, and proportion for a given row. 1644 # Returns data tagged with bin, quarter, and downtime reason. 1645 # """ 1646 1647 # registration = row['aircraft'].lower() 1648 # downtime_start = pd.to_datetime(row['offline'], utc=True) 1649 # downtime_end = pd.to_datetime(row['online'], utc=True) 1650 # reason = row.get('reason', None) 1651 1652 # hour = downtime_start.hour 1653 # if 0 <= hour <= 5: 1654 # six_hour_bin = '00-05' 1655 # elif 6 <= hour <= 11: 1656 # six_hour_bin = '06-11' 1657 # elif 12 <= hour <= 17: 1658 # six_hour_bin = '12-17' 1659 # else: 1660 # six_hour_bin = '18-23' 1661 1662 # quarter = downtime_start.quarter 1663 1664 # # Match callsign 1665 # match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration] 1666 # if match.empty: 1667 # return { 1668 # 'registration': registration, 1669 # 'offline': downtime_start, 1670 # 'online': downtime_end, 1671 # 'six_hour_bin': six_hour_bin, 1672 # 'quarter': quarter, 1673 # 'total_offline': None, 1674 # 'scheduled_minutes': None, 1675 # 'reason': reason, 1676 # 'proportion': None 1677 # } 1678 # callsign = match.iloc[0]['callsign'] 1679 1680 # rota_rows = rota_df[rota_df['callsign'] == callsign] 1681 # if rota_rows.empty: 1682 # return { 1683 # 'registration': registration, 1684 # 'offline': downtime_start, 1685 # 'online': downtime_end, 1686 # 'six_hour_bin': six_hour_bin, 1687 # 'quarter': quarter, 1688 # 'total_offline': None, 1689 # 'scheduled_minutes': None, 1690 # 'reason': reason, 1691 # 'proportion': None 1692 # } 1693 1694 # month = downtime_start.month 1695 # season = 'summer' if month in [4, 5, 6, 7, 8, 9] else 'winter' 1696 1697 # total_scheduled_minutes = 0 1698 # total_overlap_minutes = 0 1699 1700 # for _, rota in rota_rows.iterrows(): 1701 # start_hour = rota[f'{season}_start'] 1702 # end_hour = rota[f'{season}_end'] 1703 1704 # for base_day in [downtime_start.normalize() - timedelta(days=1), 1705 # downtime_start.normalize(), 1706 # downtime_start.normalize() + timedelta(days=1)]: 1707 1708 # rota_start = base_day + timedelta(hours=start_hour) 1709 # rota_end = base_day + timedelta(hours=end_hour) 1710 # if end_hour <= start_hour: 1711 # rota_end += timedelta(days=1) 1712 1713 # overlap_start = max(downtime_start, rota_start) 1714 # overlap_end = min(downtime_end, rota_end) 1715 1716 # if overlap_end > overlap_start: 1717 # scheduled_minutes = (rota_end - rota_start).total_seconds() / 60 1718 # overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60 1719 1720 # total_scheduled_minutes += scheduled_minutes 1721 # total_overlap_minutes += overlap_minutes 1722 1723 # if total_scheduled_minutes == 0: 1724 # proportion = None 1725 # else: 1726 # proportion = total_overlap_minutes / total_scheduled_minutes 1727 1728 # return { 1729 # 'registration': registration, 1730 # 'offline': downtime_start, 1731 # 'online': downtime_end, 1732 # 'six_hour_bin': six_hour_bin, 1733 # 'quarter': quarter, 1734 # 'total_offline': total_overlap_minutes, 1735 # 'scheduled_minutes': total_scheduled_minutes, 1736 # 'reason': reason, 1737 # 'proportion': proportion 1738 # } 1739 1740 # ================ ARCHIVED CODE - ad-hoc unavailability calculation ================ # 1741 # def ad_hoc_unavailability(self, period_start, period_end, include_debugging_cols=False): 1742 # """Process ad hoc unavailability records into a stratified probability table. 1743 1744 # Calculates the probability of ad hoc unavailability and availability based 1745 # on historical data. 
The data is stratified by aircraft registration, 1746 # six-hour time bins, and calendar quarters. It ensures all standard 1747 # reasons ('available', 'crew', 'weather', 'aircraft') are present for 1748 # each combination. It filters out any registration/quarter/bin pairings 1749 # with little scheduled time, marking their probabilities as blank. It also adds 1750 # a count of the ad-hoc unavailability events considered for each probability calculation 1751 # and the total scheduled time for the resource in that quarter/time period that was part of 1752 # the calculations. 1753 1754 # Returns 1755 # ------- 1756 # None 1757 # This function does not return a value but saves the calculated 1758 # probabilities to 'distribution_data/ad_hoc_unavailability.csv'. 1759 # The CSV file includes columns for registration, six_hour_bin, 1760 # quarter, reason, probability, and count. 1761 1762 # Notes 1763 # ----- 1764 # This function relies on the following input files: 1765 # - 'external_data/ad_hoc.csv': Contains ad hoc unavailability records. 1766 # - 'actual_data/HEMS_ROTA.csv': Contains rota information. 1767 # - 'actual_data/callsign_registration_lookup.csv': Maps callsigns to registrations. 1768 # It also depends on the 'calculate_availability_row' method within the 1769 # same class. Ensure that the 'distribution_data' directory exists. 1770 # """ 1771 # try: 1772 # # Load data 1773 # adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online']) 1774 # adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']] 1775 # rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv") 1776 # callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv") 1777 1778 # # Process each ad hoc record 1779 # results = adhoc_df.apply( 1780 # lambda row: self.calculate_availability_row(row, rota_df, callsign_lookup_df, period_start, period_end), 1781 # axis=1 1782 # ) 1783 # final_df = pd.DataFrame(results.tolist()) 1784 # final_df.to_csv("external_data/ad_hoc_intermediate.csv") 1785 1786 # # Check if final_df is empty before proceeding 1787 # if final_df.empty: 1788 # print("No ad-hoc data processed, skipping file generation.") 1789 # return 1790 1791 # # Define the full set of reasons expected 1792 # all_reasons = ['available', 'crew', 'weather', 'aircraft'] 1793 1794 # # --- Aggregate Data --- 1795 # # Calculate job count per registration, quarter, AND bin 1796 # unavailability_instance_counts = final_df.groupby(['registration', 'quarter', 'six_hour_bin']).size().reset_index(name='unavailability_instance_counts') 1797 1798 # # Downtime by bin + quarter + reason (only for unavailability reasons) 1799 # grouped = final_df[final_df['reason'].isin(['crew', 'weather', 'aircraft'])] 1800 # grouped = grouped.groupby(['registration', 'six_hour_bin', 'quarter', 'reason'])['total_offline'].sum().reset_index() 1801 1802 # # Scheduled time by bin + quarter 1803 # scheduled_totals = final_df.groupby(['registration', 'six_hour_bin', 'quarter'])['scheduled_minutes'].sum().reset_index() 1804 # scheduled_totals = scheduled_totals.rename(columns={'scheduled_minutes': 'total_scheduled'}) 1805 1806 # # Merge job counts into scheduled totals 1807 # scheduled_totals = pd.merge(scheduled_totals, unavailability_instance_counts, on=['registration', 'quarter', 'six_hour_bin'], how='left') 1808 1809 # # Calculate total downtime per bin + quarter (for 'available' calculation) 1810 # downtime_totals = grouped.groupby(['registration','six_hour_bin', 'quarter'])['total_offline'].sum().reset_index() 
1811 # downtime_totals = downtime_totals.rename(columns={'total_offline': 'total_downtime'}) 1812 1813 # # --- Create Full Grid --- 1814 # # Get all unique combinations of registration, quarter, and bin 1815 # unique_bins = scheduled_totals[['registration', 'quarter', 'six_hour_bin']].drop_duplicates() 1816 1817 # # Check for empty unique_bins 1818 # if unique_bins.empty: 1819 # print("No valid unique bins found, skipping file generation.") 1820 # return 1821 1822 # # Create the full grid by crossing unique bins with all reasons 1823 # full_grid = unique_bins.assign(key=1).merge(pd.DataFrame({'reason': all_reasons, 'key': 1}), on='key').drop('key', axis=1) 1824 1825 # # --- Merge Data into Full Grid --- 1826 # full_grid = pd.merge(full_grid, scheduled_totals, on=['registration', 'quarter', 'six_hour_bin'], how='left') 1827 # full_grid = pd.merge(full_grid, grouped, on=['registration', 'six_hour_bin', 'quarter', 'reason'], how='left') 1828 # full_grid = pd.merge(full_grid, downtime_totals, on=['registration', 'six_hour_bin', 'quarter'], how='left') 1829 1830 # # Fill NaNs created during merges 1831 # full_grid['total_offline'] = full_grid['total_offline'].fillna(0) 1832 # full_grid['total_downtime'] = full_grid['total_downtime'].fillna(0) 1833 # full_grid['unavailability_instance_counts'] = full_grid['unavailability_instance_counts'].fillna(0) # Fill job count with 0 for bins that might exist but have no jobs 1834 1835 # # --- Calculate Probabilities --- 1836 # # Suppress division by zero warnings - we handle these next 1837 # with np.errstate(divide='ignore', invalid='ignore'): 1838 # prob_avail = (full_grid['total_scheduled'] - full_grid['total_downtime']) / full_grid['total_scheduled'] 1839 # prob_unavail = full_grid['total_offline'] / full_grid['total_scheduled'] 1840 1841 # full_grid['probability'] = np.where( 1842 # full_grid['reason'] == 'available', 1843 # prob_avail, 1844 # prob_unavail 1845 # ) 1846 1847 # # Handle NaN/Inf from division by zero, set them to 0.0 for now. 
1848 # full_grid['probability'] = full_grid['probability'].replace([np.inf, -np.inf], np.nan).fillna(0.0) 1849 1850 # # --- Apply Threshold and Blanking --- 1851 # # Condition for setting probability to blank 1852 # condition_for_blank = (full_grid['total_scheduled'] < 60 * 2 * 30 * 3 ) # If less than 2 hours per day in time period rotad, exclude 1853 1854 # # Convert probability to object to allow blanks, then apply the condition 1855 # full_grid['probability'] = full_grid['probability'].astype(object) 1856 # full_grid.loc[condition_for_blank, 'probability'] = '' 1857 1858 # # --- Finalize and Save --- 1859 # # Select and rename columns 1860 # if include_debugging_cols: 1861 # final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability', 'unavailability_instance_counts', 'total_offline', 'total_scheduled']] 1862 # final_prob_df.rename(columns={"total_offine":"total_minutes_offline_for_reason", "total_scheduled": "total_minutes_rotad_availability_in_quarter_and_time_period"}) 1863 # else: 1864 # final_prob_df = full_grid[['registration', 'six_hour_bin', 'quarter', 'reason', 'probability']] 1865 # # final_prob_df = final_prob_df.rename(columns={'job_count': 'count'}) 1866 1867 # # Sort and save 1868 # final_prob_df = final_prob_df.sort_values(by=['registration', 'quarter', 'six_hour_bin', 'reason']).reset_index(drop=True) 1869 # final_prob_df.to_csv("distribution_data/ad_hoc_unavailability.csv", index=False) 1870 1871 # print("Ad-hoc unavailability probability table generated successfully.") 1872 1873 # except FileNotFoundError: 1874 # print("Couldn't generate ad-hoc unavailability due to missing file(s). " 1875 # "Please ensure 'external_data/ad_hoc.csv', " 1876 # "'actual_data/HEMS_ROTA.csv', and " 1877 # "'actual_data/callsign_registration_lookup.csv' exist.") 1878 # pass 1879 # except Exception as e: 1880 # print(f"An error occurred during ad-hoc unavailability processing: {e}") 1881 # pass 1882 1883 def ad_hoc_unavailability(self, period_start, period_end): 1884 """ 1885 Calculate aircraft availability and unavailability probabilities based on scheduled rotas and ad-hoc downtime. 1886 1887 Args: 1888 period_start (str/datetime): Start date for analysis period 1889 period_end (str/datetime): End date for analysis period 1890 include_debugging_cols (bool): Whether to include debugging columns in output (currently unused) 1891 1892 Returns: 1893 None (saves results to CSV file) 1894 """ 1895 # Load and prepare ad-hoc downtime data 1896 adhoc_df = pd.read_csv('external_data/ad_hoc.csv', parse_dates=['offline', 'online']) 1897 adhoc_df = adhoc_df[['aircraft', 'offline', 'online', 'reason']] 1898 # Load rota and callsign lookup data 1899 rota_df = pd.read_csv("actual_data/HEMS_ROTA.csv") 1900 callsign_lookup_df = pd.read_csv("actual_data/callsign_registration_lookup.csv") 1901 # Merge rota with callsign lookup to get registration numbers to allow matching with 1902 # ad-hoc data, which uses registrations 1903 full_rota_df = rota_df.merge(callsign_lookup_df, on="callsign") 1904 1905 # Define the hour bands mapping 1906 HOUR_BANDS = { 1907 '00-05': (0, 6), # 00:00 to 05:59 1908 '06-11': (6, 12), # 06:00 to 11:59 1909 '12-17': (12, 18), # 12:00 to 17:59 1910 '18-23': (18, 24) # 18:00 to 23:59 1911 } 1912 1913 # Create list of 6-hour bins 1914 bins = ['00-05', '06-11', '12-17', '18-23'] 1915 1916 def is_summer(date_obj, summer_start_month=4, summer_end_month=9): 1917 """ 1918 Determine if a date falls in summer months (April-September). 
1919 1920 Args: 1921 date_obj: Date object to check 1922 1923 Returns: 1924 bool: True if date is in summer months 1925 """ 1926 return date_obj.month in [i for i in range(summer_start_month, summer_end_month+1)] 1927 1928 def check_month_is_summer(month, summer_start_month=4, summer_end_month=9): 1929 """ 1930 Determine if a date falls in summer months (April-September). 1931 1932 Args: 1933 month: Integer month 1934 1935 Returns: 1936 str: 'summer' is in summer months, 'winter' if in winter months 1937 """ 1938 return 'summer' if month in [i for i in range(summer_start_month, summer_end_month+1)] else 'winter' 1939 1940 def get_band_for_hour(hour): 1941 """ 1942 Return the 6-hour band name for a given hour (0-23). 1943 1944 Args: 1945 hour (int): Hour of day (0-23) 1946 1947 Returns: 1948 str or None: Band name or None if hour is invalid 1949 """ 1950 for band_name, (start, end) in HOUR_BANDS.items(): 1951 if start <= hour < end: 1952 return band_name 1953 return None 1954 1955 def calculate_minutes_in_band(shift_start, shift_end, band_start, band_end): 1956 """ 1957 Calculate how many minutes of a shift fall within a specific time band. 1958 1959 Args: 1960 shift_start (float): Shift start time in hours (0-23.99) 1961 shift_end (float): Shift end time in hours (0-23.99) 1962 band_start (int): Band start hour 1963 band_end (int): Band end hour 1964 1965 Returns: 1966 int: Minutes of overlap between shift and band 1967 """ 1968 # Find the overlap between shift and band 1969 overlap_start = max(shift_start, band_start) 1970 overlap_end = min(shift_end, band_end) 1971 1972 if overlap_start < overlap_end: 1973 return int((overlap_end - overlap_start) * 60) # Convert to minutes 1974 return 0 1975 1976 # Create date range for analysis period 1977 date_range = pd.date_range(start=period_start, 1978 end=period_end, 1979 freq='D') 1980 daily_df = pd.DataFrame({'date': date_range}) 1981 daily_df['date'] = pd.to_datetime(daily_df['date']) 1982 1983 # Create MultiIndex from date and bins combinations to get all date/time combinations 1984 multi_index = pd.MultiIndex.from_product( 1985 [daily_df['date'], bins], 1986 names=['date', 'six_hour_bin'] 1987 ) 1988 1989 # Convert MultiIndex to DataFrame and reset index 1990 daily_df = multi_index.to_frame(index=False).reset_index(drop=True) 1991 # Add quarter information for seasonal analysis 1992 daily_df["quarter"] = daily_df.date.dt.quarter 1993 1994 # Initialize availability columns for each aircraft registration 1995 # Each column will store minutes of scheduled time per time band 1996 for registration in full_rota_df['registration'].unique(): 1997 daily_df[registration] = 0 # Initialize with 0 minutes 1998 1999 # Calculate scheduled availability for each date/time band combination 2000 for _, row in daily_df.iterrows(): 2001 current_date = row['date'] 2002 current_band = row['six_hour_bin'] 2003 band_start, band_end = HOUR_BANDS[current_band] 2004 2005 is_current_date_summer = is_summer(current_date) 2006 2007 # Get the row index for updating the dataframe 2008 row_idx = daily_df[(daily_df['date'] == current_date) & 2009 (daily_df['six_hour_bin'] == current_band)].index[0] 2010 2011 # Iterate through each resource's rota entry 2012 for _, rota_entry in full_rota_df.iterrows(): 2013 registration = rota_entry['registration'] 2014 # Select appropriate start/end times based on season 2015 start_hour_col = 'summer_start' if is_current_date_summer else 'winter_start' 2016 end_hour_col = 'summer_end' if is_current_date_summer else 'winter_end' 2017 
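# Pull this resource's rota start/end hours for the current season, then work out how many of
# those scheduled minutes fall inside the current 6-hour band. Shifts that cross midnight
# (start_hour > end_hour) are split into a 'today' part and a 'tomorrow' part below.
# Worked example (shift times hypothetical): a 07:00-19:00 shift gives
#   calculate_minutes_in_band(7, 19, 6, 12)  -> 300 minutes credited to the '06-11' band
#   calculate_minutes_in_band(7, 19, 12, 18) -> 360 minutes credited to the '12-17' band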
start_hour = rota_entry[start_hour_col] 2018 end_hour = rota_entry[end_hour_col] 2019 2020 total_minutes_for_band = 0 2021 2022 if start_hour < end_hour: 2023 # Shift within same day 2024 total_minutes_for_band = calculate_minutes_in_band( 2025 start_hour, end_hour, band_start, band_end 2026 ) 2027 elif start_hour > end_hour: 2028 # Shift spans midnight - check both parts 2029 2030 # Part 1: Today from start_hour to midnight 2031 if band_end <= 24: # This band is today 2032 total_minutes_for_band += calculate_minutes_in_band( 2033 start_hour, 24, band_start, band_end 2034 ) 2035 2036 # Part 2: Tomorrow from midnight to end_hour 2037 # Need to check if this band is for tomorrow 2038 tomorrow = current_date + pd.Timedelta(days=1) 2039 tomorrow_rows = daily_df[daily_df['date'] == tomorrow] 2040 2041 if not tomorrow_rows.empty and current_band in tomorrow_rows['six_hour_bin'].values: 2042 total_minutes_for_band += calculate_minutes_in_band( 2043 0, end_hour, band_start, band_end 2044 ) 2045 2046 # Update the scheduled time for this aircraft in this time band 2047 daily_df.loc[row_idx, registration] += total_minutes_for_band 2048 2049 # Aggregate scheduled availability by quarter, time band, and registration 2050 available_time = ( 2051 daily_df.melt(id_vars=["date", "six_hour_bin", "quarter"], value_name="rota_time", var_name="registration") 2052 .groupby(["quarter", "six_hour_bin", "registration"])[["rota_time"]] 2053 .sum().reset_index() 2054 ) 2055 2056 def calculate_availability_row(row, rota_df, callsign_lookup_df): 2057 """ 2058 Calculate downtime overlap with scheduled rota for a single downtime event. 2059 2060 Args: 2061 row: DataFrame row containing downtime information 2062 rota_df: DataFrame with rota schedules 2063 callsign_lookup_df: DataFrame mapping callsigns to registrations 2064 2065 Returns: 2066 dict: Dictionary containing processed availability data 2067 """ 2068 # Extract downtime information 2069 registration = row['aircraft'].lower() 2070 downtime_start = pd.to_datetime(row['offline'], utc=True) 2071 downtime_end = pd.to_datetime(row['online'], utc=True) 2072 reason = row.get('reason', None) 2073 2074 # Determine which 6-hour bin this downtime starts in 2075 hour = downtime_start.hour 2076 if 0 <= hour <= 5: 2077 six_hour_bin = '00-05' 2078 elif 6 <= hour <= 11: 2079 six_hour_bin = '06-11' 2080 elif 12 <= hour <= 17: 2081 six_hour_bin = '12-17' 2082 else: 2083 six_hour_bin = '18-23' 2084 2085 quarter = downtime_start.quarter 2086 2087 # Find the callsign for this registration 2088 match = callsign_lookup_df[callsign_lookup_df['registration'].str.lower() == registration] 2089 2090 # No matching callsign found 2091 if match.empty: 2092 return { 2093 'registration': registration, 2094 'offline': downtime_start, 2095 'online': downtime_end, 2096 'six_hour_bin': six_hour_bin, 2097 'quarter': quarter, 2098 'total_offline': None, 2099 'reason': reason 2100 } 2101 2102 callsign = match.iloc[0]['callsign'] 2103 # Find rota entries for this callsign 2104 rota_rows = rota_df[rota_df['callsign'] == callsign] 2105 if rota_rows.empty: 2106 return { 2107 'registration': registration, 2108 'offline': downtime_start, 2109 'online': downtime_end, 2110 'six_hour_bin': six_hour_bin, 2111 'quarter': quarter, 2112 'total_offline': None, 2113 'reason': reason 2114 } 2115 2116 # Determine season for appropriate rota times 2117 month = downtime_start.month 2118 season = check_month_is_summer(month) 2119 2120 total_overlap_minutes = 0 2121 2122 # Calculate overlap between downtime and scheduled 
rota times 2123 for _, rota in rota_rows.iterrows(): 2124 start_hour = rota[f'{season}_start'] 2125 end_hour = rota[f'{season}_end'] 2126 2127 # Check overlap across multiple days (yesterday, today, tomorrow) 2128 # This handles shifts that span midnight 2129 for base_day in [downtime_start.normalize() - timedelta(days=1), 2130 downtime_start.normalize(), 2131 downtime_start.normalize() + timedelta(days=1)]: 2132 2133 rota_start = base_day + timedelta(hours=start_hour) 2134 rota_end = base_day + timedelta(hours=end_hour) 2135 2136 # Handle shifts that cross midnight 2137 if end_hour <= start_hour: 2138 rota_end += timedelta(days=1) 2139 # Calculate overlap between downtime and this rota period 2140 overlap_start = max(downtime_start, rota_start) 2141 overlap_end = min(downtime_end, rota_end) 2142 2143 if overlap_end > overlap_start: 2144 overlap_minutes = (overlap_end - overlap_start).total_seconds() / 60 2145 2146 total_overlap_minutes += overlap_minutes 2147 2148 return { 2149 'registration': registration, 2150 'offline': downtime_start, 2151 'online': downtime_end, 2152 'six_hour_bin': six_hour_bin, 2153 'quarter': quarter, 2154 'total_offline': total_overlap_minutes, 2155 'reason': reason 2156 } 2157 2158 # Process all ad-hoc downtime events 2159 results = adhoc_df.apply( 2160 lambda row: calculate_availability_row( 2161 row, rota_df, callsign_lookup_df), 2162 axis=1 2163 ) 2164 2165 # Convert results to DataFrame and select relevant columns 2166 unavailability_minutes_df = ( 2167 pd.DataFrame(results.tolist()) 2168 [["registration", "six_hour_bin", "quarter", "total_offline", "reason"]] 2169 ) 2170 2171 # Aggregate offline time by registration, time band, quarter, and reason 2172 offline_durations = unavailability_minutes_df.groupby( 2173 ["registration", "six_hour_bin", "quarter", "reason"] 2174 )[["total_offline"]].sum().reset_index() 2175 2176 # Create complete combinations to ensure all possible categories are represented 2177 # This prevents missing data issues in the final output 2178 registrations = offline_durations['registration'].unique() 2179 six_hour_bins = offline_durations['six_hour_bin'].unique() 2180 quarters = offline_durations['quarter'].unique() 2181 reasons = offline_durations['reason'].unique() 2182 2183 # Generate all possible combinations 2184 all_combinations = list(itertools.product(registrations, six_hour_bins, quarters, reasons)) 2185 2186 # Create complete dataframe with all combinations 2187 complete_df = pd.DataFrame( 2188 all_combinations, 2189 columns=['registration', 'six_hour_bin', 'quarter', 'reason'] 2190 ) 2191 2192 # Merge with original data to preserve existing values, fill missing with 0 2193 offline_durations = complete_df.merge( 2194 offline_durations, 2195 on=['registration', 'six_hour_bin', 'quarter', 'reason'], 2196 how='left' 2197 ) 2198 2199 # Fill NaN values with 0 (no downtime for those combinations) 2200 offline_durations['total_offline'] = offline_durations['total_offline'].fillna(0.0) 2201 2202 # Sort for better readability 2203 offline_durations = offline_durations.sort_values( 2204 ['registration', 'six_hour_bin', 'quarter', 'reason'] 2205 ).reset_index(drop=True) 2206 2207 # Ensure consistent case for registration names 2208 available_time["registration"] = available_time["registration"].str.lower() 2209 offline_durations["registration"] = offline_durations["registration"].str.lower() 2210 2211 # Merge scheduled time with downtime data 2212 ad_hoc = available_time.merge(offline_durations, on=["registration", "quarter", 
"six_hour_bin"]) 2213 ad_hoc["probability"] = ad_hoc["total_offline"] / ad_hoc["rota_time"] 2214 2215 # Calculate availability probability (1 - sum of all unavailability probabilities) 2216 available_prop_df = ad_hoc.groupby( 2217 ["quarter", "six_hour_bin", "registration", "rota_time"] 2218 )[["probability"]].sum().reset_index() 2219 2220 available_prop_df["reason"] = "available" 2221 available_prop_df["probability"] = 1 - available_prop_df["probability"] 2222 2223 # Combine unavailability and availability data 2224 final_ad_hoc_df = ( 2225 pd.concat([ad_hoc, available_prop_df]) 2226 .sort_values(["quarter", "six_hour_bin", "registration", "reason"]) 2227 ) 2228 2229 # Handle cases where there's no scheduled time (set probability to NaN) 2230 final_ad_hoc_df["probability"] = final_ad_hoc_df.apply( 2231 lambda x: np.nan if x["rota_time"]==0 else x["probability"], 2232 axis=1 2233 ) 2234 2235 final_ad_hoc_df["probability"] = final_ad_hoc_df["probability"].round(5) 2236 2237 # Save results to CSV 2238 final_ad_hoc_df[["registration", "six_hour_bin", "quarter", "reason", "probability"]].to_csv( 2239 "distribution_data/ad_hoc_unavailability.csv", index=False 2240 ) 2241 2242 2243 def run_sim_on_historical_params(self): 2244 # Ensure all rotas are using default values 2245 rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv") 2246 rota.to_csv("actual_data/HEMS_ROTA.csv", index=False) 2247 2248 callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv") 2249 callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False) 2250 2251 service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv") 2252 service_history.to_csv("actual_data/service_history.csv", index=False) 2253 2254 service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv") 2255 service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False) 2256 2257 print("Generating simulation results...") 2258 removeExistingResults() 2259 2260 total_runs = 30 2261 sim_years = 2 2262 sim_duration = 60 * 24 * 7 * 52 * sim_years 2263 2264 parallelProcessJoblib( 2265 total_runs=total_runs, 2266 sim_duration=sim_duration, 2267 warm_up_time=0, 2268 sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"), 2269 amb_data=False, 2270 print_debug_messages=True 2271 ) 2272 2273 collateRunResults() 2274 2275 try: 2276 results_all_runs = pd.read_csv("data/run_results.csv") 2277 # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False) 2278 2279 # save data of counts of suboptimal care category sent 2280 counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", 'hems_res_category', "care_cat"]].value_counts().reset_index() 2281 2282 counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"].agg(["mean", "min", "max"]).reset_index() 2283 2284 counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv") 2285 2286 # save data of counts of suboptimal vehicle type sent 2287 counts_df = results_all_runs[results_all_runs["event_type"]=="resource_use"][["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index() 2288 2289 counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"].agg(["mean", "min", "max"]).reset_index() 2290 2291 
counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv") 2292 2293 2294 # # Also run the model to get some base-case outputs 2295 # resource_requests = ( 2296 # results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"] 2297 # .copy() 2298 # ) 2299 2300 # resource_requests["care_cat"] = ( 2301 # resource_requests.apply(lambda x: "REG - Helicopter Benefit" if x["heli_benefit"]=="y" 2302 # and x["care_cat"]=="REG" else x["care_cat"], 2303 # axis=1) 2304 # ) 2305 2306 # missed_jobs_care_cat_summary = ( 2307 # resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs") 2308 # .sort_values(["care_cat", "time_type"]) 2309 # .copy() 2310 # ) 2311 2312 # missed_jobs_care_cat_summary["jobs_average"] = ( 2313 # missed_jobs_care_cat_summary["jobs"]/ 2314 # total_runs 2315 # ) 2316 2317 # missed_jobs_care_cat_summary["jobs_per_year_average"] = ( 2318 # (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years*365)*365) 2319 # ).round(0) 2320 2321 missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df( 2322 results_all_runs=results_all_runs, 2323 run_length_days = float(sim_years*365), 2324 what="summary" 2325 ) 2326 2327 missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv") 2328 2329 2330 missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df( 2331 results_all_runs=results_all_runs, 2332 run_length_days = float(sim_years*365), 2333 what="breakdown" 2334 ) 2335 2336 missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv") 2337 2338 except FileNotFoundError: 2339 pass
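For orientation, the two entry points above are normally used together: import_and_wrangle() regenerates the fitted distributions and historical summaries, and run_sim_on_historical_params() then re-runs the DES against the historical rotas. A minimal, illustrative driver is sketched below; the `__main__` guard and the input CSV path are assumptions, not part of the module.

if __name__ == "__main__":
    # Illustrative only: regenerate distributions, then re-run the simulation
    # with the historical rota/service files copied into actual_data/.
    fitter = DistributionFitUtils("data/my_data.csv", calculate_school_holidays=True)  # hypothetical input path
    fitter.import_and_wrangle()
    fitter.run_sim_on_historical_params()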
118 def import_and_wrangle(self): 119 """ 120 121 Function to import CSV, add additional columns that are required 122 and then sequentially execute other class functions to generate 123 the probabilities and distributions required for the DES. 124 125 TODO: Additional logic is required to check the imported CSV 126 for missing values, incorrect columns names etc. 127 128 """ 129 130 try: 131 df = pd.read_csv(self.file_path, quoting=QUOTE_ALL) 132 self.df = df 133 134 # Perhaps run some kind of checking function here. 135 136 except FileNotFoundError: 137 print(f"Cannot locate that file") 138 139 # If everything is okay, crack on... 140 self.df['inc_date'] = pd.to_datetime(self.df['inc_date']) 141 self.df['date_only'] = pd.to_datetime(df['inc_date'].dt.date) 142 self.df['hour'] = self.df['inc_date'].dt.hour # Hour of the day 143 self.df['day_of_week'] = self.df['inc_date'].dt.day_name() # Day of the week (e.g., Monday) 144 self.df['month'] = self.df['inc_date'].dt.month 145 self.df['quarter'] = self.df['inc_date'].dt.quarter 146 self.df['first_day_of_month'] = self.df['inc_date'].to_numpy().astype('datetime64[M]') 147 148 # Replacing a upper quartile limit on job cycle times and 149 # instead using a manually specified time frame. 150 # This has the advantage of allowing manual amendment of the falues 151 # on the front-end 152 # self.max_values_df = self.upper_allowable_time_bounds() 153 self.min_max_values_df = pd.read_csv('actual_data/upper_allowable_time_bounds.csv') 154 #print(self.min_max_values_df) 155 156 #This will be needed for other datasets, but has already been computed for DAA 157 #self.df['ampds_card'] = self.df['ampds_code'].str[:2] 158 159 self.removeExistingResults(Utils.HISTORICAL_FOLDER) 160 self.removeExistingResults(Utils.DISTRIBUTION_FOLDER) 161 162 #get proportions of AMPDS card by hour of day 163 self.hour_by_ampds_card_probs() 164 165 # Determine 'best' distributions for time-based data 166 self.activity_time_distributions() 167 168 # Calculate probability patient will be female based on AMPDS card 169 self.sex_by_ampds_card_probs() 170 171 # Determine 'best' distributions for age ranges straitifed by AMPDS card 172 self.age_distributions() 173 174 # Alternative approach to IA times. Start with probabilty of call at given hour stratified by quarter 175 self.hourly_arrival_by_qtr_probs() 176 177 # Calculates the mean and standard deviation of the number of incidents per day stratified by quarter 178 self.incidents_per_day() 179 self.incidents_per_day_samples() 180 181 # Calculate probability of enhanced or critical care being required based on AMPDS card 182 self.enhanced_or_critical_care_by_ampds_card_probs() 183 184 # Calculate HEMS result 185 self.hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs() 186 187 # Calculate probability of callsign being allocated to a job based on AMPDS card and hour of day 188 # self.callsign_group_by_ampds_card_and_hour_probs() 189 # self.callsign_group_by_ampds_card_probs() 190 # self.callsign_group_by_care_category() 191 self.callsign_group() 192 193 # Calculate probability of a particular vehicle type based on callsign group and month of year 194 # self.vehicle_type_by_month_probs() 195 self.vehicle_type_by_quarter_probs() 196 # self.vehicle_type_probs() # Similar to previous but without monthly stratification since ad hoc unavailability should account for this. 
197 198 # Calculate the patient outcome (conveyed, deceased, unknown) 199 self.patient_outcome_by_care_category_and_quarter_probs() 200 201 # ============= ARCHIVED CODE ================= # 202 # Calculate the mean inter-arrival times stratified by yearly quarter and hour of day 203 # self.inter_arrival_times() 204 # ============= END ARCHIVED CODE ================= # 205 206 207 # ============= ARCHIVED CODE ================= # 208 # Calculate probably of patient outcome 209 # Note - this still needs to be run to support another one? 210 # Calculate probability of a specific patient outcome being allocated to a job based on HEMS result and callsign 211 # self.pt_outcome_by_hems_result_and_care_category_probs() 212 # ============= END ARCHIVED CODE ================= # 213 214 # ============= ARCHIVED CODE ================= # 215 # self.hems_results_by_patient_outcome_and_quarter_and_vehicle_type_and_callsign_group_probs() 216 # ============= END ARCHIVED CODE ================= # 217 218 # ============= ARCHIVED CODE ================= # 219 # Calculate probabily of HEMS result being allocated to a job based on callsign and hour of day 220 # self.hems_result_by_callsign_group_and_vehicle_type_probs() 221 # ============= END ARCHIVED CODE ================= # 222 223 # ============= ARCHIVED CODE ================= # 224 # Calculate probability of HEMS result being allocated to a job based on care category and helicopter benefit 225 # self.hems_result_by_care_cat_and_helicopter_benefit_probs() 226 # ============= END ARCHIVED CODE ================= # 227 228 229 # Calculate school holidays since servicing schedules typically avoid these dates 230 if self.calculate_school_holidays: 231 self.school_holidays() 232 233 # Calculate historical data 234 self.historical_monthly_totals() 235 self.historical_monthly_totals_by_callsign() 236 self.historical_monthly_totals_by_day_of_week() 237 self.historical_median_time_of_activities_by_month_and_resource_type() 238 self.historical_monthly_totals_by_hour_of_day() 239 self.historical_monthly_resource_utilisation() 240 self.historical_monthly_totals_all_calls() 241 self.historical_daily_calls_breakdown() 242 self.historical_job_durations_breakdown() 243 self.historical_missed_jobs() 244 self.historical_jobs_per_day_per_callsign() 245 self.historical_care_cat_counts() 246 247 # Calculate proportions of ad hoc unavailability 248 try: 249 # self.ad_hoc_unavailability() 250 self.ad_hoc_unavailability(period_start="2022-08-01", period_end="2024-07-31") 251 except FileNotFoundError: 252 print("Couldn't find ad-hoc unavailability file")
Function to import CSV, add additional columns that are required and then sequentially execute other class functions to generate the probabilities and distributions required for the DES.
TODO: Additional logic is required to check the imported CSV for missing values, incorrect column names etc.
    def hour_by_ampds_card_probs(self):
        """
        Calculates the proportions of calls that are triaged with
        a specific AMPDS card. This is stratified by hour of day

        TODO: Determine whether this should also be stratified by yearly quarter
        """
        category_counts = self.df.groupby(['hour', 'ampds_card']).size().reset_index(name='count')
        total_counts = category_counts.groupby('hour')['count'].transform('sum')
        category_counts['proportion'] = round(category_counts['count'] / total_counts, 4)

        #category_counts['ampds_card'] = category_counts['ampds_card'].apply(lambda x: str(x).zfill(2))

        category_counts.to_csv('distribution_data/hour_by_ampds_card_probs.csv', mode="w+")
Calculates the proportions of calls that are triaged with a specific AMPDS card. This is stratified by hour of day
TODO: Determine whether this should also be stratified by yearly quarter
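As an illustration of how the resulting table might be consumed in the DES, the sketch below samples an AMPDS card for a given hour from the CSV written by this method; the sampling helper and seeded generator are assumptions, not part of this module.

# Illustrative only: sample an AMPDS card for a given hour from the probabilities written above.
import numpy as np
import pandas as pd

hour_card_probs = pd.read_csv('distribution_data/hour_by_ampds_card_probs.csv')
rng = np.random.default_rng(42)

def sample_ampds_card(hour: int):
    subset = hour_card_probs[hour_card_probs['hour'] == hour]
    # Re-normalise in case rounding to 4 d.p. means the proportions do not sum to exactly 1
    p = subset['proportion'] / subset['proportion'].sum()
    return rng.choice(subset['ampds_card'].to_numpy(), p=p)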
    def sex_by_ampds_card_probs(self):
        """
        Calculates the probability that the patient will be female
        stratified by AMPDS card.
        """
        age_df = self.df
        category_counts = age_df.groupby(['ampds_card', 'sex']).size().reset_index(name='count')
        total_counts = category_counts.groupby('ampds_card')['count'].transform('sum')
        category_counts['proportion'] = round(category_counts['count'] / total_counts, 3)

        category_counts[category_counts['sex'] == 'Female'].to_csv('distribution_data/sex_by_ampds_card_probs.csv', mode="w+")
Calculates the probability that the patient will be female stratified by AMPDS card.
286 def activity_time_distributions(self): 287 """ 288 289 Determine the 'best' distribution for each phase of a call 290 i.e. Allocation time, Mobilisation time, Time to scene 291 Time on scene, Travel time to hospital and handover, Time to clear. 292 Not all times will apply to all cases, so the class 'times_to_fit' 293 variable is a list of dictionaries, which contains the times to fit 294 295 The data is currently stratitied by HEMS_result and vehicle type fields. 296 297 """ 298 299 vehicle_type = self.df['vehicle_type'].dropna().unique() 300 301 # We'll need to make sure that where a distribution is missing that the time is set to 0 in the model. 302 # Probably easier than complicated logic to determine what times should be available based on hems_result 303 304 final_distr = [] 305 306 for row in self.times_to_fit: 307 #print(row) 308 for ttf in row['times_to_fit']: 309 for vt in vehicle_type: 310 #print(f"HEMS result is {row['hems_result']} times_to_fit is {ttf} and vehicle type is {vt}") 311 312 # This line might not be required if data quality is determined when importing the data 313 max_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].max_value_mins.iloc[0] 314 min_time = self.min_max_values_df[self.min_max_values_df['time'] == ttf].min_value_mins.iloc[0] 315 316 #print(f"Max time is {max_time} and Min time is {min_time}") 317 318 if ttf == 'time_on_scene': 319 # There is virtually no data for HEMS_result other than patient conveyed 320 # which is causing issues with fitting. For time on scene, will 321 # use a simplified fitting ignoring hems_result as a category 322 fit_times = self.df[ 323 (self.df.vehicle_type == vt) & 324 (self.df[ttf] >= min_time) & 325 (self.df[ttf] <= max_time) 326 ][ttf] 327 else: 328 fit_times = self.df[ 329 (self.df.vehicle_type == vt) & 330 (self.df[ttf] >= min_time) & 331 (self.df[ttf] <= max_time) & 332 (self.df.hems_result == row['hems_result']) 333 ][ttf] 334 #print(fit_times[:10]) 335 best_fit = self.getBestFit(fit_times, distr=self.sim_tools_distr_plus) 336 #print(best_fit) 337 338 return_dict = { "vehicle_type": vt, "time_type" : ttf, "best_fit": best_fit, "hems_result": row['hems_result'], "n": len(fit_times)} 339 #print(return_dict) 340 final_distr.append(return_dict) 341 342 with open('distribution_data/activity_time_distributions.txt', 'w+') as convert_file: 343 convert_file.write(json.dumps(final_distr)) 344 convert_file.close()
Determine the 'best' distribution for each phase of a call, i.e. allocation time, mobilisation time, time to scene, time on scene, travel time to hospital and handover, and time to clear. Not all times apply to all cases, so the class 'times_to_fit' variable is a list of dictionaries specifying which times to fit for each HEMS result.
The data is currently stratified by the HEMS_result and vehicle type fields.
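The distributions file written above is only useful if the stored parameters can be turned back into something that can be sampled. A hedged sketch of that round trip is shown below; it assumes the JSON structure written by activity_time_distributions() and that Fitter's get_best() stored parameters either as a name/value dict (recent fitter versions) or as a positional tuple (older versions).

# Illustrative only: rebuild a frozen scipy distribution from one stored best-fit entry.
import json
import scipy.stats as st

with open('distribution_data/activity_time_distributions.txt') as f:
    activity_fits = json.load(f)

entry = activity_fits[0]  # e.g. one (vehicle_type, time_type, hems_result) combination
dist_name, params = next(iter(entry['best_fit'].items()))
dist = getattr(st, dist_name)
frozen = dist(**params) if isinstance(params, dict) else dist(*params)
sampled_minutes = frozen.rvs(size=1)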
    def age_distributions(self):
        """
        Determine the 'best' distribution for age stratified by
        AMPDS card
        """
        age_distr = []

        age_df = self.df[["age", "ampds_card"]].dropna()
        ampds_cards = age_df['ampds_card'].unique()
        print(ampds_cards)

        for card in ampds_cards:
            fit_ages = age_df[age_df['ampds_card'] == card]['age']
            best_fit = self.getBestFit(fit_ages, distr=self.sim_tools_distr_plus)
            return_dict = {"ampds_card": str(card), "best_fit": best_fit, "n": len(fit_ages)}
            age_distr.append(return_dict)

        with open('distribution_data/age_distributions.txt', 'w+') as convert_file:
            convert_file.write(json.dumps(age_distr))
            convert_file.close()
Determine the 'best' distribution for age stratified by AMPDS card
403 def incidents_per_day(self): 404 """ 405 Fit distributions for number of incidents per day using actual daily counts, 406 applying year-based weighting to reflect trends (e.g., 2024 busier than 2023), 407 stratified by season and quarter. 408 """ 409 import math 410 import json 411 import numpy as np 412 413 inc_df = self.df[['inc_date', 'date_only', 'quarter']].copy() 414 inc_df['year'] = inc_df['date_only'].dt.year 415 416 # Daily incident counts 417 inc_per_day = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day') 418 inc_per_day['year'] = inc_per_day['date_only'].dt.year 419 420 # Merge quarter and season from self.df 421 date_info = self.df[['date_only', 'quarter']].drop_duplicates() 422 423 if 'season' not in self.df.columns: 424 date_info['season'] = date_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer") 425 else: 426 date_info = date_info.merge( 427 self.df[['date_only', 'season']].drop_duplicates(), 428 on='date_only', 429 how='left' 430 ) 431 432 inc_per_day = inc_per_day.merge(date_info, on='date_only', how='left') 433 434 # Weight settings - simple implementation rather than biased mean thing 435 year_weights = { 436 2023: 1.0, 437 2024: 4.0 # 10% more weight to 2024 438 } 439 440 # ========== SEASONAL DISTRIBUTIONS ========== 441 jpd_distr = [] 442 443 for season in inc_per_day['season'].dropna().unique(): 444 filtered = inc_per_day[inc_per_day['season'] == season].copy() 445 filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0) 446 447 # Repeat rows proportionally by weight 448 replicated = filtered.loc[ 449 filtered.index.repeat((filtered['weight'] * 10).round().astype(int)) 450 ]['jobs_per_day'] 451 452 best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus) 453 454 jpd_distr.append({ 455 "season": season, 456 "best_fit": best_fit, 457 "min_n_per_day": int(replicated.min()), 458 "max_n_per_day": int(replicated.max()), 459 "mean_n_per_day": float(replicated.mean()) 460 }) 461 462 with open('distribution_data/inc_per_day_distributions.txt', 'w+') as f: 463 json.dump(jpd_distr, f) 464 465 # ========== QUARTERLY DISTRIBUTIONS ========== 466 jpd_qtr_distr = [] 467 468 for quarter in inc_per_day['quarter'].dropna().unique(): 469 filtered = inc_per_day[inc_per_day['quarter'] == quarter].copy() 470 filtered['weight'] = filtered['year'].map(year_weights).fillna(1.0) 471 472 replicated = filtered.loc[ 473 filtered.index.repeat((filtered['weight'] * 10).round().astype(int)) 474 ]['jobs_per_day'] 475 476 best_fit = self.getBestFit(np.array(replicated), distr=self.sim_tools_distr_plus) 477 478 jpd_qtr_distr.append({ 479 "quarter": int(quarter), 480 "best_fit": best_fit, 481 "min_n_per_day": int(replicated.min()), 482 "max_n_per_day": int(replicated.max()), 483 "mean_n_per_day": float(replicated.mean()) 484 }) 485 486 with open('distribution_data/inc_per_day_qtr_distributions.txt', 'w+') as f: 487 json.dump(jpd_qtr_distr, f)
Fit distributions for number of incidents per day using actual daily counts, applying year-based weighting to reflect trends (e.g., 2024 busier than 2023), stratified by season and quarter.
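The year-weighting is implemented by replicating each day's count in proportion to its year weight before fitting. A toy sketch of that replication step (values are made up; it is not the method itself) is:

import pandas as pd

daily = pd.DataFrame({
    'jobs_per_day': [8, 12, 10, 15],
    'year':         [2023, 2023, 2024, 2024],
})
year_weights = {2023: 1.0, 2024: 4.0}   # assumed weights, mirroring the method's defaults

daily['weight'] = daily['year'].map(year_weights).fillna(1.0)
# Repeat each row (weight * 10) times so heavier years contribute more observations to the fit
replicated = daily.loc[daily.index.repeat((daily['weight'] * 10).round().astype(int))]['jobs_per_day']
# 'replicated' would then be passed to getBestFit() in place of the raw daily counts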
489 def incidents_per_day_samples(self, weight_map=None, scale_factor=10): 490 """ 491 Create weighted empirical samples of incidents per day by season and quarter. 492 """ 493 494 inc_df = self.df[['date_only', 'quarter']].copy() 495 inc_df['year'] = inc_df['date_only'].dt.year 496 inc_df['season'] = inc_df['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer") 497 498 # Get raw counts per day 499 daily_counts = inc_df.groupby('date_only').size().reset_index(name='jobs_per_day') 500 daily_counts['year'] = daily_counts['date_only'].dt.year 501 502 # Merge back in season/quarter info 503 meta_info = self.df[['date_only', 'quarter']].drop_duplicates() 504 if 'season' in self.df.columns: 505 meta_info = meta_info.merge( 506 self.df[['date_only', 'season']].drop_duplicates(), 507 on='date_only', how='left' 508 ) 509 else: 510 meta_info['season'] = meta_info['quarter'].map(lambda q: "winter" if q in [1, 4] else "summer") 511 512 daily_counts = daily_counts.merge(meta_info, on='date_only', how='left') 513 514 # Year weight map 515 if weight_map is None: 516 weight_map = {2023: 1.0, 2024: 1.1} 517 518 # Compute weights 519 daily_counts['weight'] = daily_counts['year'].map(weight_map).fillna(1.0) 520 521 # Storage 522 empirical_samples = {} 523 524 # Season-based 525 for season in daily_counts['season'].dropna().unique(): 526 filtered = daily_counts[daily_counts['season'] == season].copy() 527 repeated = filtered.loc[ 528 filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int)) 529 ]['jobs_per_day'].tolist() 530 531 empirical_samples[season] = repeated 532 533 # Quarter-based 534 for quarter in daily_counts['quarter'].dropna().unique(): 535 filtered = daily_counts[daily_counts['quarter'] == quarter].copy() 536 repeated = filtered.loc[ 537 filtered.index.repeat((filtered['weight'] * scale_factor).round().astype(int)) 538 ]['jobs_per_day'].tolist() 539 540 empirical_samples[f"Q{int(quarter)}"] = repeated 541 542 with open("distribution_data/inc_per_day_samples.json", 'w') as f: 543 json.dump(empirical_samples, f)
Create weighted empirical samples of incidents per day by season and quarter.
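The empirical samples written by this method can be resampled directly rather than going through a fitted distribution. A minimal sketch, assuming the keys written above ('summer', 'winter', 'Q1'-'Q4'):

# Illustrative only: draw a daily incident count from the saved weighted empirical pool.
import json
import random

with open("distribution_data/inc_per_day_samples.json") as f:
    samples = json.load(f)

jobs_today = random.choice(samples["Q1"])   # resample with replacement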
    def enhanced_or_critical_care_by_ampds_card_probs(self):
        """
        Calculates the probability of an enhanced or critical care resource being required
        based on the AMPDS card
        """
        ec_df = self.df[['ampds_card', 'ec_benefit', 'cc_benefit']].copy()

        def assign_care_category(row):
            # There are some columns with both EC and CC benefit selected
            # this function will allocate to only 1
            if row['cc_benefit'] == 'y':
                return 'CC'
            elif row['ec_benefit'] == 'y':
                return 'EC'
            else:
                return 'REG'

        ec_df['care_category'] = ec_df.apply(assign_care_category, axis=1)

        care_cat_counts = ec_df.groupby(['ampds_card', 'care_category']).size().reset_index(name='count')
        total_counts = care_cat_counts.groupby('ampds_card')['count'].transform('sum')

        care_cat_counts['proportion'] = round(care_cat_counts['count'] / total_counts, 3)

        care_cat_counts.to_csv('distribution_data/enhanced_or_critical_care_by_ampds_card_probs.csv', mode="w+", index=False)
Calculates the probability of an enhanced or critical care resource being required, based on the AMPDS card
575 def patient_outcome_by_care_category_and_quarter_probs(self): 576 """ 577 578 Calculates the probabilty of a patient outcome based on care category and yearly quarter 579 580 """ 581 582 po_df = self.df[['quarter', 'ec_benefit', 'cc_benefit', 'pt_outcome']].copy() 583 584 def assign_care_category(row): 585 # There are some columns with both EC and CC benefit selected 586 # this function will allocate to only 1 587 if row['cc_benefit'] == 'y': 588 return 'CC' 589 elif row['ec_benefit'] == 'y': 590 return 'EC' 591 else: 592 return 'REG' 593 594 # There are some values that are missing e.g. CC quarter 1 Deceased 595 # I think we've had problems when trying to sample from this kind of thing before 596 # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0 597 outcomes = po_df['pt_outcome'].unique() 598 care_categories = ['CC', 'EC', 'REG'] 599 quarters = po_df['quarter'].unique() 600 601 all_combinations = pd.DataFrame(list(itertools.product(outcomes, care_categories, quarters)), 602 columns=['pt_outcome', 'care_category', 'quarter']) 603 604 po_df['care_category'] = po_df.apply(assign_care_category, axis = 1) 605 606 po_cat_counts = po_df.groupby(['pt_outcome', 'care_category', 'quarter']).size().reset_index(name='count') 607 608 merged = pd.merge(all_combinations, po_cat_counts, 609 on=['pt_outcome', 'care_category', 'quarter'], 610 how='left').fillna({'count': 0}) 611 merged['count'] = merged['count'].astype(int) 612 613 total_counts = merged.groupby(['care_category', 'quarter'])['count'].transform('sum') 614 merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3) 615 616 merged.to_csv('distribution_data/patient_outcome_by_care_category_and_quarter_probs.csv', mode = "w+", index = False)
Calculates the probability of a patient outcome based on care category and yearly quarter
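The 'build the full grid of combinations, then fill missing counts with zero' pattern used above also appears in ad_hoc_unavailability() and historical_jobs_per_day_per_callsign(). A hypothetical helper (not part of the class) that captures the pattern might look like:

import itertools
import pandas as pd

def complete_counts(counts: pd.DataFrame, category_cols) -> pd.DataFrame:
    # Cross-product of every observed category value, so absent combinations appear with count 0
    grid = pd.DataFrame(
        list(itertools.product(*[counts[c].unique() for c in category_cols])),
        columns=list(category_cols)
    )
    out = grid.merge(counts, on=list(category_cols), how='left')
    out['count'] = out['count'].fillna(0).astype(int)
    return out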
654 def hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs(self): 655 """ 656 657 Calculates the probabilty of a given HEMS result based on 658 - patient outcome 659 - yearly quarter 660 - time of day (7am - 6pm, 7pm - 6am) 661 - vehicle type 662 - and callsign group 663 664 """ 665 self.df['inc_date'] = pd.to_datetime(self.df['inc_date']) 666 self.df['hour'] = self.df['inc_date'].dt.hour 667 self.df['time_of_day'] = self.df['hour'].apply(lambda x: 'day' if x >= 7 and x <= 18 else "night") 668 669 hr_df = self.df[[ 670 'hems_result', 'quarter', 'pt_outcome', 671 'vehicle_type', 'callsign_group', 'time_of_day' 672 ]].copy() 673 674 # There are some values that are missing e.g. CC quarter 1 Deceased 675 # I think we've had problems when trying to sample from this kind of thing before 676 # As a fallback, ensure that 'missing' combinations are given a count and proportion of 0 677 # hems_results = hr_df['hems_result'].unique() 678 # outcomes = hr_df['pt_outcome'].unique() 679 # vehicle_categories = [x for x in hr_df['vehicle_type'].unique() if pd.notna(x)] 680 # callsign_group_categories = hr_df['callsign_group'].unique() 681 # quarters = hr_df['quarter'].unique() 682 683 # all_combinations = pd.DataFrame(list(itertools.product(hems_results, outcomes, vehicle_categories, callsign_group_categories, quarters)), 684 # columns=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter']) 685 686 hr_cat_counts = hr_df.groupby(['hems_result', 'pt_outcome', 687 'vehicle_type', 'callsign_group', 688 'quarter', 'time_of_day']).size().reset_index(name='count') 689 690 # merged = pd.merge(all_combinations, hr_cat_counts, 691 # on=['hems_result', 'pt_outcome', 'vehicle_type', 'callsign_group', 'quarter'], 692 # how='left').fillna({'count': 0}) 693 # merged['count'] = merged['count'].astype(int) 694 695 merged = hr_cat_counts 696 697 total_counts = merged.groupby(['pt_outcome', 698 'vehicle_type', 'callsign_group', 699 'quarter', 'time_of_day'])['count'].transform('sum') 700 merged['proportion'] = round(merged['count'] / total_counts.replace(0, 1), 3) 701 702 merged.to_csv('distribution_data/hems_results_by_patient_outcome_and_time_of_day_and_quarter_and_vehicle_type_and_callsign_group_probs.csv', mode = "w+", index = False)
Calculates the probability of a given HEMS result based on:
- patient outcome
- yearly quarter
- time of day (7am - 6pm, 7pm - 6am)
- vehicle type
- callsign group
(the day/night split used for time of day is sketched below)
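A minimal sketch of the day/night binning referred to above, matching the lambda used in the method (hours 7-18 inclusive count as 'day'):

import pandas as pd

hours = pd.Series([2, 7, 18, 19])
time_of_day = hours.apply(lambda x: 'day' if 7 <= x <= 18 else 'night')
# -> ['night', 'day', 'day', 'night']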
    def hourly_arrival_by_qtr_probs(self):
        """
        Calculates the proportions of calls arriving in any given hour
        stratified by yearly quarter
        """
        ia_df = self.df[['quarter', 'hour']].dropna()

        hourly_counts = ia_df.groupby(['hour', 'quarter']).size().reset_index(name='count')
        total_counts = hourly_counts.groupby(['quarter'])['count'].transform('sum')
        hourly_counts['proportion'] = round(hourly_counts['count'] / total_counts, 4)

        hourly_counts.sort_values(by=['quarter', 'hour']).to_csv('distribution_data/hourly_arrival_by_qtr_probs.csv', mode="w+")
Calculates the proportions of calls arriving in any given hour stratified by yearly quarter
    def callsign_group_by_ampds_card_and_hour_probs(self):
        """
        Calculates the probability of a specific callsign group being allocated to
        a call based on the AMPDS card category and hour of day
        """
        callsign_counts = self.df.groupby(['ampds_card', 'hour', 'callsign_group']).size().reset_index(name='count')

        total_counts = callsign_counts.groupby(['ampds_card', 'hour'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_and_hour_probs.csv', mode="w+", index=False)
Calculates the probability of a specific callsign group being allocated to a call based on the AMPDS card category and hour of day
    def callsign_group_by_ampds_card_probs(self):
        """
        Calculates the probability of a specific callsign group being allocated to
        a call based on the AMPDS card category
        """
        callsign_df = self.df[self.df['callsign_group'] != 'Other']

        callsign_counts = callsign_df.groupby(['ampds_card', 'callsign_group']).size().reset_index(name='count')

        total_counts = callsign_counts.groupby(['ampds_card'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/callsign_group_by_ampds_card_probs.csv', mode="w+", index=False)
Calculates the probability of a specific callsign group being allocated to a call based on the AMPDS card category
754 def callsign_group(self): 755 """ 756 Calculates the probabilty of a specific callsign being allocated to 757 a call 758 """ 759 df = self.df.copy() 760 761 # Convert time fields to numeric 762 time_fields = [ 763 "time_allocation", "time_mobile", "time_to_scene", 764 "time_on_scene", "time_to_hospital", "time_to_clear" 765 ] 766 for col in time_fields: 767 df[col] = pd.to_numeric(df[col], errors="coerce") 768 769 # Calculate total job duration in minutes 770 df["job_duration_min"] = df[time_fields].sum(axis=1, skipna=True) 771 772 # Compute job start and end times 773 df["start_time"] = df["inc_date"] 774 df["end_time"] = df["start_time"] + pd.to_timedelta(df["job_duration_min"], unit="m") 775 776 # Sort jobs by start time 777 df = df.sort_values(by="start_time").reset_index(drop=True) 778 779 # Set to hold indices of jobs that overlap (but only the later-starting ones) 780 overlapping = set() 781 782 # Check for overlaps 783 for i in range(len(df)): 784 this_end = df.at[i, "end_time"] 785 786 # Compare only to later jobs 787 for j in range(i + 1, len(df)): 788 next_start = df.at[j, "start_time"] 789 if next_start >= this_end: 790 break # No more possible overlaps 791 # If it starts before i's job ends, it's overlapping 792 overlapping.add(j) 793 794 # Mark the overlaps in the dataframe 795 df["overlaps"] = df.index.isin(overlapping) 796 797 # Filter out overlapping jobs 798 df_no_overlap = df[~df["overlaps"]] 799 800 # We will use the ad-hoc unavailability to remove any instances where we already know one of 801 # the vehicles to be recorded as offline 802 803 data = df_no_overlap.copy() 804 805 # TODO: Ideally we'd also remove any instances where we know one of the helos to have been 806 # off for servicing if that data is available 807 ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"]) 808 ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower() 809 810 data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601") 811 data["vehicle"] = data["vehicle"].str.lower() 812 813 # Create a cross-join between data and ad_hoc 814 data['key'] = 1 815 ad_hoc['key'] = 1 816 merged = data.merge(ad_hoc, on='key') 817 818 # Keep rows where inc_date falls within the offline period 819 overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])] 820 821 # Filter out those rows from the original data 822 df_no_overlap = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key') 823 824 callsign_df = ( 825 df_no_overlap 826 .assign( 827 helicopter_benefit=np.select( 828 [ 829 df_no_overlap["cc_benefit"] == "y", 830 df_no_overlap["ec_benefit"] == "y", 831 df_no_overlap["hems_result"].isin([ 832 "Stand Down En Route", 833 "Landed but no patient contact", 834 "Stand Down Before Mobile" 835 ]) 836 ], 837 ["y", "y", "n"], 838 default=df_no_overlap["helicopter_benefit"] 839 ), 840 care_category=np.select( 841 [ 842 df_no_overlap["cc_benefit"] == "y", 843 df_no_overlap["ec_benefit"] == "y" 844 ], 845 ["CC", "EC"], 846 default="REG" 847 ) 848 ) 849 ) 850 851 callsign_df = callsign_df[callsign_df['callsign_group'] != 'Other'] 852 853 callsign_counts = callsign_df.groupby(['callsign_group']).size().reset_index(name='count') 854 855 total_counts = len(callsign_df) 856 callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 857 858 callsign_counts.to_csv('distribution_data/callsign_group_probs.csv', mode = "w+", index=False)
Calculates the probability of a specific callsign group being allocated to a call
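Before calculating the proportions, the method drops any job that started while an earlier job was still running, and any job that falls inside a recorded ad-hoc offline window. A toy sketch of the first of those filters (dates are made up) is:

import pandas as pd

jobs = pd.DataFrame({
    'start_time': pd.to_datetime(['2024-01-01 09:00', '2024-01-01 09:30', '2024-01-01 11:00']),
    'end_time':   pd.to_datetime(['2024-01-01 10:00', '2024-01-01 10:15', '2024-01-01 12:00']),
}).sort_values('start_time').reset_index(drop=True)

overlapping = set()
for i in range(len(jobs)):
    for j in range(i + 1, len(jobs)):
        if jobs.at[j, 'start_time'] >= jobs.at[i, 'end_time']:
            break                 # jobs are sorted by start, so nothing later can overlap job i
        overlapping.add(j)        # job j began while job i was still running

jobs_no_overlap = jobs[~jobs.index.isin(overlapping)]   # keeps rows 0 and 2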
961 def vehicle_type_by_quarter_probs(self): 962 """ 963 964 Calculates the probabilty of a car/helicopter being allocated to 965 a call based on the callsign group and quarter of the year 966 967 Quarter accounts for seasonal variation without being as affected by 968 969 """ 970 data = self.df.copy() 971 972 # We will use the ad-hoc unavailability to remove any instances where we already know one of 973 # the vehicles to be recorded as offline 974 975 # TODO: Ideally we'd also remove any instances where we know one of the helos to have been 976 # off for servicing if that data is available 977 ad_hoc = pd.read_csv("external_data/ad_hoc.csv", parse_dates=["offline", "online"]) 978 ad_hoc["aircraft"] = ad_hoc["aircraft"].str.lower() 979 980 data["inc_date"] = pd.to_datetime(data["inc_date"], format="ISO8601") 981 data["vehicle"] = data["vehicle"].str.lower() 982 983 # Create a cross-join between data and ad_hoc 984 data['key'] = 1 985 ad_hoc['key'] = 1 986 merged = data.merge(ad_hoc, on='key') 987 988 # Keep rows where inc_date falls within the offline period 989 overlap = merged[(merged['inc_date'] >= merged['offline']) & (merged['inc_date'] <= merged['online'])] 990 991 # Filter out those rows from the original data 992 filtered_data = data[~data['inc_date'].isin(overlap['inc_date'])].drop(columns='key') 993 994 # First, calculate overall props 995 callsign_counts = filtered_data.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count') 996 997 total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum') 998 callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 999 1000 callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode = "w+") 1001 1002 # Then, redo by quarter 1003 callsign_counts = filtered_data.groupby(['callsign_group', 'quarter', 'vehicle_type']).size().reset_index(name='count') 1004 1005 total_counts = callsign_counts.groupby(['callsign_group', 'quarter'])['count'].transform('sum') 1006 callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4) 1007 1008 callsign_counts.to_csv('distribution_data/vehicle_type_by_quarter_probs.csv', mode = "w+")
Calculates the probability of a car/helicopter being allocated to a call based on the callsign group and quarter of the year.
Quarter accounts for seasonal variation without being as affected by the small sample sizes that a month-level stratification would give.
    def vehicle_type_probs(self):
        """
        Calculates the probability of a car/helicopter being allocated to
        a call based on the callsign group
        """
        callsign_counts = self.df.groupby(['callsign_group', 'vehicle_type']).size().reset_index(name='count')

        total_counts = callsign_counts.groupby(['callsign_group'])['count'].transform('sum')
        callsign_counts['proportion'] = round(callsign_counts['count'] / total_counts, 4)

        callsign_counts.to_csv('distribution_data/vehicle_type_probs.csv', mode="w+")
Calculates the probability of a car/helicopter being allocated to a call based on the callsign group
    def hems_result_by_callsign_group_and_vehicle_type_probs(self):
        """
        Calculates the probability of a specific HEMS result being allocated to
        a call based on the callsign group and vehicle type

        TODO: These probability calculation functions could probably be refactored into a single
        function and just specify columns and output name
        """
        hems_counts = self.df.groupby(['hems_result', 'callsign_group', 'vehicle_type']).size().reset_index(name='count')

        total_counts = hems_counts.groupby(['callsign_group', 'vehicle_type'])['count'].transform('sum')
        hems_counts['proportion'] = round(hems_counts['count'] / total_counts, 4)

        hems_counts.to_csv('distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv', mode="w+", index=False)
Calculates the probability of a specific HEMS result being allocated to a call based on the callsign group and vehicle type
TODO: These probability calculation functions could probably be refactored into a single function and just specify columns and output name
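The refactor suggested by this TODO could be a single generic helper that takes the outcome and stratification columns plus an output path; a hypothetical sketch (not part of the class) is below.

import pandas as pd

def probability_table(df: pd.DataFrame, outcome_cols, strata_cols, out_path: str, dp: int = 4) -> pd.DataFrame:
    # Count every combination, then normalise within each stratum
    counts = df.groupby(list(strata_cols) + list(outcome_cols)).size().reset_index(name='count')
    totals = counts.groupby(list(strata_cols))['count'].transform('sum')
    counts['proportion'] = round(counts['count'] / totals, dp)
    counts.to_csv(out_path, mode="w+", index=False)
    return counts

# e.g. probability_table(self.df, ['hems_result'], ['callsign_group', 'vehicle_type'],
#                        'distribution_data/hems_result_by_callsign_group_and_vehicle_type_probs.csv')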
    def school_holidays(self) -> None:
        """
        Function to generate a CSV file containing school holiday
        start and end dates for a given year. The year range is determined
        by the submitted data (plus a year at the end of the study for good measure)
        """
        min_date = self.df.inc_date.min()
        max_date = self.df.inc_date.max() + timedelta(weeks=(52 * self.school_holidays_years))

        u = Utils()

        years_of_holidays_list = u.years_between(min_date, max_date)

        sh = pd.DataFrame(columns=['year', 'start_date', 'end_date'])

        for i, year in enumerate(years_of_holidays_list):
            tmp = u.calculate_term_holidays(year)

            if i == 0:
                sh = tmp
            else:
                sh = pd.concat([sh, tmp])

        sh.to_csv('actual_data/school_holidays.csv', index=False)
" Function to generate a CSV file containing schoole holiday start and end dates for a given year. The Year range is determined by the submitted data (plus a year at the end of the study for good measure)
    def historical_jobs_per_day_per_callsign(self):
        df = self.df

        df["date"] = pd.to_datetime(df["inc_date"]).dt.date
        all_counts_hist = df.groupby(["date", "callsign"])["job_id"].count().reset_index()
        all_counts_hist.rename(columns={'job_id': 'jobs_in_day'}, inplace=True)

        all_combinations = pd.DataFrame(
            list(itertools.product(df['date'].unique(), df['callsign'].unique())),
            columns=['date', 'callsign']
        ).dropna()

        merged = all_combinations.merge(all_counts_hist, on=['date', 'callsign'], how='left')
        merged['jobs_in_day'] = merged['jobs_in_day'].fillna(0).astype(int)

        all_counts = merged.groupby(['callsign', 'jobs_in_day']).count().reset_index().rename(columns={"date": "count"})
        all_counts.to_csv("historical_data/historical_jobs_per_day_per_callsign.csv", index=False)
1179 def historical_care_cat_counts(self): 1180 """ 1181 Process historical incident data to categorize care types and compute hourly counts. 1182 1183 This method performs the following steps: 1184 - Converts incident dates to datetime format. 1185 - Extracts month start and hour from the incident date. 1186 - Categorizes each incident into care categories based on benefit flags and attendance. 1187 - Counts the number of incidents by hour and care category. 1188 - Outputs these counts to a CSV file. 1189 - Computes and writes the proportion of regular care jobs with a helicopter benefit 1190 (excluding those not attended by a DAA resource) to a text file. 1191 1192 Outputs: 1193 - CSV file: 'historical_data/historical_care_cat_counts.csv' 1194 - Text file: 'distribution_data/proportion_jobs_heli_benefit.txt' 1195 """ 1196 1197 df_historical = self.df 1198 1199 df_historical['inc_date'] = pd.to_datetime(df_historical['inc_date']) 1200 # Extract the first day of the month and the hour of each incident 1201 df_historical['month_start'] = df_historical.inc_date.dt.strftime("%Y-%m-01") 1202 df_historical['hour'] = df_historical.inc_date.dt.hour 1203 1204 conditions = [ 1205 df_historical['cc_benefit'] == 'y', 1206 df_historical['ec_benefit'] == 'y', 1207 df_historical['helicopter_benefit'] == 'y', 1208 df_historical['callsign_group'] == 'Other' 1209 ] 1210 1211 choices = [ 1212 'CC', 1213 'EC', 1214 'REG - helicopter benefit', 1215 'Unknown - DAA resource did not attend' 1216 ] 1217 # Assign care category to each record 1218 # If the case did not meet any of the criteria in 'conditions', it will default 1219 # to being labelled as a 'regular/REG' case (i.e there was no benefit recorded) 1220 df_historical['care_category'] = np.select(conditions, choices, default='REG') 1221 1222 # Count occurrences grouped by hour and care category 1223 historical_value_counts_by_hour = ( 1224 df_historical.value_counts(["hour", "care_category"]) 1225 .reset_index(name="count") 1226 ) 1227 # Output to CSV for use in tests and visualisations 1228 (historical_value_counts_by_hour 1229 .sort_values(['hour', 'care_category']) 1230 .to_csv("historical_data/historical_care_cat_counts.csv")) 1231 1232 # Also output the % of regular (not cc/ec) jobs with a helicopter benefit 1233 # These are the regular jobs we will make an assumption follow different logic due to having an obvious expected 1234 # patient benefit of having a helicopter allocated to them that we will have to assume is apparent at the time 1235 # of the call being placed (such as the casualty being located in a remote location, or ) 1236 1237 numerator = (historical_value_counts_by_hour[ 1238 historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit" 1239 ]["count"].sum()) 1240 1241 denominator = (historical_value_counts_by_hour[ 1242 (historical_value_counts_by_hour["care_category"] == "REG - helicopter benefit") | 1243 (historical_value_counts_by_hour["care_category"] == "REG") 1244 ]["count"].sum()) 1245 1246 with open('distribution_data/proportion_jobs_heli_benefit.txt', 'w+') as heli_benefit_file: 1247 heli_benefit_file.write(json.dumps((numerator/denominator).round(4))) 1248 1249 # Count occurrences grouped by hour and care category 1250 historical_value_counts_by_hour_cc_ec = ( 1251 df_historical.value_counts(["hour", "care_category", "helicopter_benefit"]) 1252 .reset_index(name="count") 1253 ) 1254 1255 # Output to CSV for use in tests and visualisations 1256 # (historical_value_counts_by_hour_cc_ec 1257 # 
.sort_values(["hour", "care_category", "helicopter_benefit"]) 1258 # .to_csv("historical_data/historical_care_cat_counts_cc_ec.csv")) 1259 1260 1261 numerator_cc = ( 1262 historical_value_counts_by_hour_cc_ec[ 1263 (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") & 1264 (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y") 1265 ]["count"].sum()) 1266 1267 denominator_cc = ( 1268 historical_value_counts_by_hour_cc_ec[ 1269 (historical_value_counts_by_hour_cc_ec["care_category"] == "CC") 1270 ]["count"].sum()) 1271 1272 with open('distribution_data/proportion_jobs_heli_benefit_cc.txt', 'w+') as heli_benefit_file: 1273 heli_benefit_file.write(json.dumps((numerator_cc/denominator_cc).round(4))) 1274 1275 numerator_ec = ( 1276 historical_value_counts_by_hour_cc_ec[ 1277 (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") & 1278 (historical_value_counts_by_hour_cc_ec["helicopter_benefit"] == "y") 1279 ]["count"].sum()) 1280 1281 denominator_ec = ( 1282 historical_value_counts_by_hour_cc_ec[ 1283 (historical_value_counts_by_hour_cc_ec["care_category"] == "EC") 1284 ]["count"].sum()) 1285 1286 with open('distribution_data/proportion_jobs_heli_benefit_ec.txt', 'w+') as heli_benefit_file: 1287 heli_benefit_file.write(json.dumps((numerator_ec/denominator_ec).round(4)))
Process historical incident data to categorize care types and compute hourly counts.
This method performs the following steps:
- Converts incident dates to datetime format.
- Extracts month start and hour from the incident date.
- Categorizes each incident into care categories based on benefit flags and attendance.
- Counts the number of incidents by hour and care category.
- Outputs these counts to a CSV file.
- Computes and writes the proportion of regular care jobs with a helicopter benefit (excluding those not attended by a DAA resource) to a text file.
Outputs:
- CSV file: 'historical_data/historical_care_cat_counts.csv'
- Text file: 'distribution_data/proportion_jobs_heli_benefit.txt'
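The proportion files listed above each contain a single rounded number written with json.dumps, so reading one back is straightforward; a minimal sketch:

with open('distribution_data/proportion_jobs_heli_benefit.txt') as f:
    proportion_heli_benefit = float(f.read())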
    def historical_monthly_totals(self):
        """
        Calculates monthly incident totals from provided dataset of historical data
        """
        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hems_result', 'vehicle_type']]\
            .drop_duplicates(subset="inc_date", keep="first")

        is_stand_down = monthly_df['hems_result'].str.contains("Stand Down")
        monthly_df['stand_down_car'] = ((monthly_df['vehicle_type'] == "car") & is_stand_down).astype(int)
        monthly_df['stand_down_helicopter'] = ((monthly_df['vehicle_type'] == "helicopter") & is_stand_down).astype(int)

        monthly_totals_df = monthly_df.groupby('first_day_of_month').agg(
            stand_down_car=('stand_down_car', 'sum'),
            stand_down_helicopter=('stand_down_helicopter', 'sum'),
            total_jobs=('vehicle_type', 'size')
        ).reset_index()

        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_jobs_per_month.csv', mode="w+", index=False)
Calculates monthly incident totals from provided dataset of historical data
    def historical_monthly_totals_all_calls(self):
        """
        Calculates monthly call totals (all calls) from the provided dataset of historical data
        """
        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month']].dropna()

        monthly_totals_df = monthly_df.groupby(['first_day_of_month']).count().reset_index()

        monthly_totals_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_all_calls.csv', mode="w+", index=False)
Calculates monthly call totals (all calls) from the provided dataset of historical data
    def historical_monthly_totals_by_callsign(self):
        """
        Calculates monthly incident totals from provided dataset of historical data stratified by callsign
        """
        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'callsign']].dropna()

        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'callsign']).count().reset_index()

        #print(monthly_totals_df.head())

        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='callsign', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)

        #print(monthly_totals_pivot_df.head())

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_callsign.csv', mode="w+", index=False)
Calculates monthly incident totals from provided dataset of historical data stratified by callsign
    def historical_monthly_totals_by_hour_of_day(self):
        """
        Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
        """
        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'hour']].dropna()\
            .drop_duplicates(subset="inc_date", keep="first")

        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'hour']).count().reset_index()

        #print(monthly_totals_df.head())

        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='hour', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)

        #print(monthly_totals_pivot_df.head())

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_hour_of_day.csv', mode="w+", index=False)
Calculates monthly incident totals from provided dataset of historical data stratified by hour of the day
    def historical_monthly_totals_by_day_of_week(self):
        """
        Calculates number of incidents per month stratified by day of the week
        """
        # Multiple resources can be sent to the same job.
        monthly_df = self.df[['inc_date', 'first_day_of_month', 'day_of_week']].dropna()\
            .drop_duplicates(subset="inc_date", keep="first")

        monthly_totals_df = monthly_df.groupby(['first_day_of_month', 'day_of_week']).count().reset_index()

        #print(monthly_totals_df.head())

        monthly_totals_pivot_df = monthly_totals_df.pivot(index='first_day_of_month', columns='day_of_week', values='inc_date').fillna(0).reset_index().rename_axis(None, axis=1)

        #print(monthly_totals_pivot_df.head())

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_monthly_totals_by_day_of_week.csv', mode="w+", index=False)
Calculates number of incidents per month stratified by day of the week
1380 def historical_median_time_of_activities_by_month_and_resource_type(self): 1381 """ 1382 Calculate the median time for each of the job cycle phases stratified by month and vehicle type 1383 """ 1384 1385 median_df = self.df[['first_day_of_month', 'time_allocation', 1386 'time_mobile', 'time_to_scene', 'time_on_scene', 1387 'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy() 1388 1389 median_df['total_job_time'] = median_df[[ 1390 'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene', 1391 'time_to_hospital', 'time_to_clear']].sum(axis=1, skipna=True) 1392 1393 # Replacing zeros with NaN to exclude from median calculation 1394 # since if an HEMS result is Stood down en route, then time_on_scene would be zero and affect the median 1395 # median_df.replace(0, np.nan, inplace=True) 1396 1397 # Grouping by month and resource_type, calculating medians 1398 median_times = median_df.groupby(['first_day_of_month', 'vehicle_type']).median(numeric_only=True).reset_index() 1399 1400 pivot_data = median_times.pivot_table( 1401 index='first_day_of_month', 1402 columns='vehicle_type', 1403 values=['time_allocation', 'time_mobile', 'time_to_scene', 1404 'time_on_scene', 'time_to_hospital', 'time_to_clear', 'total_job_time'] 1405 ) 1406 1407 pivot_data.columns = [f"median_{col[1]}_{col[0]}" for col in pivot_data.columns] 1408 pivot_data = pivot_data.reset_index() 1409 1410 pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
    def historical_monthly_resource_utilisation(self):
        """
        Calculates number of, and time spent on, incidents per month stratified by callsign
        """

        # Multiple resources can be sent to the same job.
        monthly_df = self.df[[
            'inc_date', 'first_day_of_month', 'callsign', 'time_allocation',
            'time_mobile', 'time_to_scene', 'time_on_scene', 'time_to_hospital',
            'time_to_clear']].copy()

        monthly_df['total_time'] = monthly_df.filter(regex=r'^time_').sum(axis=1)

        monthly_totals_df = monthly_df.groupby(['callsign', 'first_day_of_month'], as_index=False)\
            .agg(n=('callsign', 'size'), total_time=('total_time', 'sum'))

        monthly_totals_pivot_df = monthly_totals_df.pivot(
            index='first_day_of_month', columns='callsign', values=['n', 'total_time'])

        monthly_totals_pivot_df.columns = [f"{col[0]}_{col[1]}" for col in monthly_totals_pivot_df.columns]
        monthly_totals_pivot_df = monthly_totals_pivot_df.reset_index()

        monthly_totals_pivot_df.rename(columns={'first_day_of_month': 'month'})\
            .to_csv('historical_data/historical_monthly_resource_utilisation.csv', mode="w+", index=False)
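    # Here the flattened columns take the form "{measure}_{callsign}", e.g. "n_H70" and
    # "total_time_H70" for a hypothetical callsign "H70"; the actual callsigns depend entirely
    # on the supplied dataset.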
    def historical_daily_calls_breakdown(self):
        """
        Calculates the number of calls received per day and summarises how many
        days had each daily call count
        """

        df = self.df
        # Convert inc_date to date only (remove time)
        df['date'] = pd.to_datetime(df['inc_date']).dt.date

        # Count number of calls per day
        calls_in_day_breakdown = df.groupby('date').size().reset_index(name='calls_in_day')

        # Save the daily call counts with a 'day' index column
        calls_in_day_breakdown_with_day = calls_in_day_breakdown.copy()
        calls_in_day_breakdown_with_day.insert(0, 'day', range(1, len(calls_in_day_breakdown) + 1))
        calls_in_day_breakdown_with_day.drop(columns='date')\
            .to_csv('historical_data/historical_daily_calls_breakdown.csv', index=False)

        # Count how many days had the same number of calls
        calls_per_day_summary = calls_in_day_breakdown['calls_in_day'].value_counts().reset_index()
        calls_per_day_summary.columns = ['calls_in_day', 'days']
        calls_per_day_summary.to_csv('historical_data/historical_daily_calls.csv', index=False)
    def historical_missed_jobs(self):
        """
        Calculates counts of incidents where a HEMS resource was, or was not,
        available and sent, stratified by month, by hour, and by quarter and hour
        """

        df = self.df
        df["date"] = pd.to_datetime(df["inc_date"])
        df["hour"] = df["date"].dt.hour
        df["month_start"] = df["date"].dt.strftime("%Y-%m-01")
        df["callsign_group_simplified"] = df["callsign_group"].apply(
            lambda x: "No HEMS available" if x == "Other" else "HEMS (helo or car) available and sent")
        df["quarter"] = df["inc_date"].dt.quarter

        # By month
        count_df_month = df[["callsign_group_simplified", "month_start"]].value_counts()\
            .reset_index(name="count").sort_values(['callsign_group_simplified', 'month_start'])
        count_df_month.to_csv("historical_data/historical_missed_calls_by_month.csv", index=False)

        # By hour
        count_df = df[["callsign_group_simplified", "hour"]].value_counts()\
            .reset_index(name="count").sort_values(['callsign_group_simplified', 'hour'])
        count_df.to_csv("historical_data/historical_missed_calls_by_hour.csv", index=False)

        # By quarter and hour
        count_df_quarter = df[["callsign_group_simplified", "quarter", "hour"]].value_counts()\
            .reset_index(name="count").sort_values(['quarter', 'callsign_group_simplified', 'hour'])
        count_df_quarter.to_csv("historical_data/historical_missed_calls_by_quarter_and_hour.csv", index=False)
    def upper_allowable_time_bounds(self):
        """
        Calculates the maximum permissible time for each phase of an incident based on supplied historical data.
        This is currently set to 1.5x the upper quartile of the data distribution
        """

        median_df = self.df[[
            'time_allocation', 'time_mobile', 'time_to_scene', 'time_on_scene',
            'time_to_hospital', 'time_to_clear', 'vehicle_type']].copy()

        # Replacing zeros with NaN to exclude them from the quantile calculation,
        # since if a HEMS result is "Stood down en route" then time_on_scene
        # would be zero and affect the result
        median_df.replace(0, np.nan, inplace=True)

        # numeric_only=True excludes the vehicle_type string column from the quantile calculation
        print(median_df.quantile(.75, numeric_only=True))
        # pivot_data.rename(columns={'first_day_of_month': 'month'}).to_csv('historical_data/historical_median_time_of_activities_by_month_and_resource_type.csv', mode="w+", index=False)
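    # Worked illustration of the intended rule (values are made up): if the 75th percentile of
    # time_on_scene were 40 minutes, the allowable upper bound would be 1.5 * 40 = 60 minutes.
    # Note that the code above currently only prints the 75th percentile; the 1.5x multiplier
    # and the CSV export are not applied here.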
    def historical_job_durations_breakdown(self):
        """
        Produces a long-format breakdown of the duration of each job cycle phase
        (plus total duration) for every job, stratified by callsign and vehicle type
        """

        df = self.df

        # 1. Select the columns of interest
        cols = [
            'callsign', 'vehicle_type',
            'time_allocation', 'time_mobile',
            'time_to_scene', 'time_on_scene',
            'time_to_hospital', 'time_to_clear'
        ]
        df2 = df[cols].copy()

        # 2. Add a 1-based row identifier
        df2['job_identifier'] = range(1, len(df2) + 1)

        # 3. Compute total_duration as the row-wise sum of the time columns
        time_cols = [
            'time_allocation', 'time_mobile',
            'time_to_scene', 'time_on_scene',
            'time_to_hospital', 'time_to_clear'
        ]
        df2['total_duration'] = df2[time_cols].sum(axis=1, skipna=True)

        #print(df2.head())

        # 4. Melt to long format
        df_long = df2.melt(
            id_vars=['job_identifier', 'callsign', 'vehicle_type'],
            value_vars=time_cols + ['total_duration'],
            var_name='name',
            value_name='value'
        )

        #print(df_long[df_long.job_identifier == 1])

        # 5. Drop any rows where callsign or vehicle_type is missing
        df_long = df_long.dropna(subset=['callsign', 'vehicle_type'])
        df_long_sorted = df_long.sort_values("job_identifier").reset_index(drop=True)

        # 6. Write out to CSV
        df_long_sorted.to_csv("historical_data/historical_job_durations_breakdown.csv", index=False)
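    # Illustrative sketch of the long-format output for a single job (values and callsign are
    # made up):
    #
    #     job_identifier,callsign,vehicle_type,name,value
    #     1,H70,helicopter,time_allocation,3.0
    #     1,H70,helicopter,time_mobile,5.0
    #     ...
    #     1,H70,helicopter,total_duration,92.0
    #
    # i.e. up to seven rows per job: one per phase plus the total_duration row.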
    def run_sim_on_historical_params(self):
        """
        Runs the simulation using historical rota, callsign and servicing parameters,
        then saves summary outputs for comparison against the historical data
        """

        # Ensure all rotas are using default (historical) values
        rota = pd.read_csv("tests/rotas_historic/HISTORIC_HEMS_ROTA.csv")
        rota.to_csv("actual_data/HEMS_ROTA.csv", index=False)

        callsign_reg_lookup = pd.read_csv("tests/rotas_historic/HISTORIC_callsign_registration_lookup.csv")
        callsign_reg_lookup.to_csv("actual_data/callsign_registration_lookup.csv", index=False)

        service_history = pd.read_csv("tests/rotas_historic/HISTORIC_service_history.csv")
        service_history.to_csv("actual_data/service_history.csv", index=False)

        service_sched = pd.read_csv("tests/rotas_historic/HISTORIC_service_schedules_by_model.csv")
        service_sched.to_csv("actual_data/service_schedules_by_model.csv", index=False)

        print("Generating simulation results...")
        removeExistingResults()

        total_runs = 30
        sim_years = 2
        sim_duration = 60 * 24 * 7 * 52 * sim_years

        parallelProcessJoblib(
            total_runs=total_runs,
            sim_duration=sim_duration,
            warm_up_time=0,
            sim_start_date=datetime.strptime("2023-01-01 05:00:00", "%Y-%m-%d %H:%M:%S"),
            amb_data=False,
            print_debug_messages=True
        )

        collateRunResults()

        try:
            results_all_runs = pd.read_csv("data/run_results.csv")
            # results_all_runs.to_csv("historical_data/calculated/SIM_hist_params.csv", index=False)

            # Save counts of suboptimal care category sent
            counts_df = results_all_runs[results_all_runs["event_type"] == "resource_use"][
                ["run_number", "hems_res_category", "care_cat"]].value_counts().reset_index()

            counts_df_summary = counts_df.groupby(["hems_res_category", "care_cat"])["count"]\
                .agg(["mean", "min", "max"]).reset_index()

            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_care_cat_sent_summary.csv")

            # Save counts of suboptimal vehicle type sent
            counts_df = results_all_runs[results_all_runs["event_type"] == "resource_use"][
                ["run_number", "vehicle_type", "heli_benefit"]].value_counts().reset_index()

            counts_df_summary = counts_df.groupby(["vehicle_type", "heli_benefit"])["count"]\
                .agg(["mean", "min", "max"]).reset_index()

            counts_df_summary.to_csv("historical_data/calculated/SIM_hist_params_suboptimal_vehicle_type_sent_summary.csv")

            # # Also run the model to get some base-case outputs
            # resource_requests = (
            #     results_all_runs[results_all_runs["event_type"] == "resource_request_outcome"]
            #     .copy()
            # )

            # resource_requests["care_cat"] = (
            #     resource_requests.apply(
            #         lambda x: "REG - Helicopter Benefit" if x["heli_benefit"] == "y"
            #         and x["care_cat"] == "REG" else x["care_cat"],
            #         axis=1)
            # )

            # missed_jobs_care_cat_summary = (
            #     resource_requests[["care_cat", "time_type"]].value_counts().reset_index(name="jobs")
            #     .sort_values(["care_cat", "time_type"])
            #     .copy()
            # )

            # missed_jobs_care_cat_summary["jobs_average"] = (
            #     missed_jobs_care_cat_summary["jobs"] / total_runs
            # )

            # missed_jobs_care_cat_summary["jobs_per_year_average"] = (
            #     (missed_jobs_care_cat_summary["jobs_average"] / float(sim_years * 365) * 365)
            # ).round(0)

            # Missed job summaries are now calculated via the shared helper module
            missed_jobs_care_cat_summary = _job_outcome_calculation.get_missed_call_df(
                results_all_runs=results_all_runs,
                run_length_days=float(sim_years * 365),
                what="summary"
            )

            missed_jobs_care_cat_summary.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_summary.csv")

            missed_jobs_care_cat_breakdown = _job_outcome_calculation.get_missed_call_df(
                results_all_runs=results_all_runs,
                run_length_days=float(sim_years * 365),
                what="breakdown"
            )

            missed_jobs_care_cat_breakdown.to_csv("historical_data/calculated/SIM_hist_params_missed_jobs_care_cat_breakdown.csv")

        except FileNotFoundError:
            pass
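    # Minimal usage sketch (illustrative only; the file path is hypothetical, and the call assumes
    # the tests/rotas_historic/ CSVs and the data/, actual_data/ and historical_data/calculated/
    # folders referenced above all exist):
    #
    #     fitter = DistributionFitUtils('data/historical_jobs.csv')
    #     fitter.run_sim_on_historical_params()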