// // C45Learner.m // Pennyworth // // Created by Chris Karr on 3/29/08. // Copyright 2008 Chris J. Karr. All rights reserved. // #import "C45Learner.h" #import "LabelManager.h" #import "Context.h" NSInteger C45Sort (id one, id two, void * context) { if (([one isEqual:@"true"] || [one isEqual:@"false"]) && ([two isEqual:@"true"] || [two isEqual:@"false"])) return [two compare:one]; return [one compare:two]; } @implementation C45Learner @synthesize multipleValueList; - (CGFloat) classInfoForExamples:(NSArray *) exampleArray attribute:(NSString *) attribute { NSUInteger s = [exampleArray count]; NSMutableDictionary * exampleCounts = [NSMutableDictionary dictionary]; for (NSDictionary * example in exampleArray) { NSString * label = [example valueForKey:LABEL_KEY]; NSNumber * count = [exampleCounts valueForKey:label]; if (count == nil) count = [NSNumber numberWithInt:0]; if (attribute != nil) { if ([example valueForKey:attribute] != nil) count = [NSNumber numberWithInt:([count integerValue] + 1)]; else s = s - 1; } else count = [NSNumber numberWithInt:([count integerValue] + 1)]; [exampleCounts setValue:count forKey:label]; } // If all values are unknown... CGFloat sum = 0; // for (NSString * outcome in [exampleCounts allKeys]) for (NSNumber * value in [exampleCounts allValues]) { CGFloat valueFloat = [value doubleValue]; if (valueFloat != 0 && s != 0) { CGFloat f = valueFloat / ((CGFloat) s); sum += (f * log2 (f)); } } return 0 - sum; } - (CGFloat) attributeInfoForExamples:(NSArray *) exampleArray attribute:(NSString *) attribute { NSMutableDictionary * partition = [NSMutableDictionary dictionary]; CGFloat count = 0; for (NSDictionary * example in exampleArray) { NSObject * value = [example valueForKey:attribute]; if (value != nil) { NSMutableArray * attributeMatches = [partition objectForKey:value]; if (attributeMatches == nil) { attributeMatches = [NSMutableArray array]; [partition setObject:attributeMatches forKey:value]; } [attributeMatches addObject:example]; count = count + 1; } } CGFloat sum = 0; for (NSMutableArray * matches in [partition allValues]) { CGFloat weight = (((CGFloat) [matches count]) / ((CGFloat) count)); sum += weight * [self classInfoForExamples:matches attribute:attribute]; } return sum; } - (CGFloat) gainForExamples:(NSArray *) exampleArray attribute:(NSString *) attribute { // See Quinlan, p. 29 CGFloat knowns = 0; for (NSDictionary * example in exampleArray) { if ([example valueForKey:attribute] != nil) knowns = knowns + 1; } CGFloat classInfo = [self classInfoForExamples:exampleArray attribute:attribute]; CGFloat attInfo = [self attributeInfoForExamples:exampleArray attribute:attribute]; CGFloat gain = (knowns / ((CGFloat) [exampleArray count])) * (classInfo - attInfo); return gain; } - (CGFloat) gainRatioForExamples:(NSArray *) exampleArray attribute:(NSString *) attribute { NSMutableDictionary * partition = [NSMutableDictionary dictionary]; for (NSDictionary * example in exampleArray) { NSObject * value = [example valueForKey:attribute]; if (value == nil) value = UNKNOWN_VALUE; NSMutableArray * attributeMatches = [partition objectForKey:value]; if (attributeMatches == nil) { attributeMatches = [NSMutableArray array]; [partition setObject:attributeMatches forKey:value]; } [attributeMatches addObject:example]; } CGFloat splitInfo = 0; for (NSArray * matches in [partition allValues]) { CGFloat exampleRatio = (((CGFloat) [matches count]) / ((CGFloat) [exampleArray count])); splitInfo -= exampleRatio * log2 (exampleRatio); } CGFloat gain = [self gainForExamples:exampleArray attribute:attribute]; if (splitInfo == 0) return 0; CGFloat gainRatio = gain / splitInfo; return gainRatio; } - (NSDictionary *) bestFeatureForExamples:(NSArray *) exampleArray { NSMutableSet * exampleAttributes = [NSMutableSet set]; for (NSDictionary * example in exampleArray) [exampleAttributes addObjectsFromArray:[example allKeys]]; [exampleAttributes removeObject:LABEL_KEY]; NSDictionary * bestFeature = nil; CGFloat bestGain = 0; for (NSString * attribute in exampleAttributes) { CGFloat gain = bestGain; NSMutableDictionary * thisFeature = [NSMutableDictionary dictionaryWithObject:attribute forKey:FEATURE_NAME]; if ([attribute rangeOfString:NUMERIC].location == NSNotFound) gain = [self gainRatioForExamples:exampleArray attribute:attribute]; else { // Handle continous values CGFloat attBestGain = 0.0; NSNumber * split = nil; // NSMutableArray * examplesCopy = [[NSUnarchiver unarchiveObjectWithData:[NSArchiver archivedDataWithRootObject:exampleArray]] mutableCopy]; NSMutableArray * examplesCopy = [NSMutableArray array]; for (NSDictionary * example in exampleArray) { if ([example count] > 0) [examplesCopy addObject:[NSMutableDictionary dictionaryWithDictionary:example]]; } NSSortDescriptor * sort = [[NSSortDescriptor alloc] initWithKey:attribute ascending:YES]; [examplesCopy sortUsingDescriptors:[NSArray arrayWithObject:sort]]; NSMutableSet * values = [NSMutableSet set]; for (NSMutableDictionary * example in examplesCopy) { NSNumber * value = [example valueForKey:attribute]; if (value != nil) [values addObject:value]; } for (NSNumber * value in [[values allObjects] sortedArrayUsingSelector:@selector(compare:)]) { NSString * tempAttribute = [NSString stringWithFormat:@"%@ >= %f %@", attribute, [value doubleValue], NUMERIC]; for (NSMutableDictionary * example in examplesCopy) { NSNumber * exampleValue = [example valueForKey:attribute]; if (exampleValue != nil) { if ([exampleValue doubleValue] >= [value doubleValue]) [example setValue:@"true" forKey:tempAttribute]; else [example setValue:@"false" forKey:tempAttribute]; } } CGFloat valueGain = [self gainRatioForExamples:examplesCopy attribute:tempAttribute]; if (valueGain >= attBestGain) { attBestGain = valueGain; split = value; } } // NSLog (@"for attribute %@, split at %@ (gain = %f)", attribute, split, attBestGain); gain = attBestGain; // attribute = [NSString stringWithFormat:@"%@ <= %@ %@", attribute, split, NUMERIC, nil]; [thisFeature setValue:split forKey:SPLIT_VALUE]; } if (gain > bestGain) { bestGain = gain; bestFeature = thisFeature; } } return bestFeature; } - (NSMutableDictionary *) treeForExamples:(NSArray *) exampleArray { NSMutableDictionary * node = [NSMutableDictionary dictionary]; NSMutableDictionary * outcomes = [NSMutableDictionary dictionary]; // Test to see if all outcomes are the same... NSString * last = nil; BOOL match = YES; for (NSDictionary * example in exampleArray) { NSString * outcome = [example valueForKey:LABEL_KEY]; if (last == nil) last = outcome; else if (![last isEqual:outcome]) match = NO; NSNumber * count = [outcomes valueForKey:outcome]; if (count == nil) count = [NSNumber numberWithInteger:0]; [outcomes setValue:[NSNumber numberWithInteger:(1 + [count intValue])] forKey:outcome]; } if (match) { [node setValue:last forKey:LABEL_KEY]; [node setValue:last forKey:MAX_OUTCOME]; [node setValue:[NSNumber numberWithInteger:[exampleArray count]] forKey:EXAMPLE_COUNT]; [node setValue:[NSNumber numberWithInteger:0] forKey:EXAMPLES_WRONG]; [node setValue:[NSNumber numberWithInteger:1] forKey:OUTCOME_COUNT]; } else { NSDictionary * feature = [self bestFeatureForExamples:exampleArray]; // Calculate the maximum outcome NSInteger max = 0; NSInteger sum = 0; NSString * maxOutcome = nil; for (NSString * outcome in [outcomes allKeys]) { NSInteger count = [[outcomes valueForKey:outcome] integerValue]; sum += count; if (count >= max) { maxOutcome = outcome; max = count; } } [node setValue:maxOutcome forKey:MAX_OUTCOME]; [node setValue:[NSNumber numberWithInteger:[exampleArray count]] forKey:EXAMPLE_COUNT]; [node setValue:[NSNumber numberWithInteger:(sum - max)] forKey:EXAMPLES_WRONG]; [node setValue:[NSNumber numberWithInteger:[[outcomes allKeys] count]] forKey:OUTCOME_COUNT]; if (feature == nil) { // Unknown feature [node setValue:maxOutcome forKey:LABEL_KEY]; } else { NSString * featureName = [feature valueForKey:FEATURE_NAME]; NSNumber * split = [feature valueForKey:SPLIT_VALUE]; // Outcomes differ - build a tree... NSMutableDictionary * matchingExamples = [NSMutableDictionary dictionary]; [node setValue:feature forKey:FEATURE_SENSOR]; NSMutableArray * unknowns = [NSMutableArray array]; for (NSMutableDictionary * example in exampleArray) { NSMutableDictionary * thisExample = [NSMutableDictionary dictionaryWithDictionary:[example copy]]; NSString * value = nil; if (split == nil) { value = [[thisExample valueForKey:featureName] description]; [thisExample removeObjectForKey:featureName]; } else { NSNumber * number = [thisExample valueForKey:featureName]; if ([number doubleValue] >= [split doubleValue]) value = @"true"; else value = @"false"; } if (value == nil) [unknowns addObject:thisExample]; else { NSMutableArray * matches = [matchingExamples valueForKey:value]; if (matches == nil) { matches = [NSMutableArray array]; [matchingExamples setValue:matches forKey:value]; } [matches addObject:thisExample]; } } for (NSString * exampleKey in [matchingExamples allKeys]) { NSMutableArray * matches = [matchingExamples valueForKey:exampleKey]; NSUInteger matchCount = [matches count]; [matches addObjectsFromArray:unknowns]; NSMutableDictionary * child = [self treeForExamples:matches]; [child setValue:[NSNumber numberWithInteger:matchCount] forKey:EXAMPLE_COUNT]; [node setValue:child forKey:exampleKey]; } [node setValue:[NSNumber numberWithInteger:[exampleArray count]] forKey:EXAMPLE_COUNT]; } } return node; } - (CGFloat) prune:(NSMutableDictionary *) tree { NSMutableArray * allKeys = [NSMutableArray arrayWithArray:[tree allKeys]]; NSString * label = [tree valueForKey:LABEL_KEY]; CGFloat wrong = [[tree valueForKey:EXAMPLES_WRONG] floatValue]; CGFloat all = [[tree valueForKey:EXAMPLE_COUNT] floatValue]; CGFloat count = [[tree valueForKey:OUTCOME_COUNT] floatValue]; CGFloat myError = (wrong + (count - 1)) / (all + count); if (label != nil) return myError; [allKeys removeObject:FEATURE_SENSOR]; [allKeys removeObject:EXAMPLE_COUNT]; [allKeys removeObject:EXAMPLES_WRONG]; [allKeys removeObject:OUTCOME_COUNT]; [allKeys removeObject:MAX_OUTCOME]; CGFloat nodesError = 0.0; for (NSString * nodeKey in allKeys) { NSMutableDictionary * node = [tree valueForKey:nodeKey]; CGFloat nodeCount = [[node valueForKey:EXAMPLE_COUNT] floatValue]; CGFloat ratio = nodeCount / all; nodesError += ratio * [self prune:node]; } if (myError < nodesError) { // prune for (NSString * nodeKey in allKeys) [tree removeObjectForKey:nodeKey]; [tree setValue:[tree valueForKey:MAX_OUTCOME] forKey:LABEL_KEY]; return myError; } return nodesError; } - (void) addExample:(NSArray *) features forClass:(NSString *) label { if (self.multipleValueList == nil) self.multipleValueList = [NSMutableDictionary dictionary]; NSMutableDictionary * example = [NSMutableDictionary dictionaryWithObject:label forKey:LABEL_KEY]; for (NSDictionary * feature in features) { NSObject * value = [feature valueForKey:FEATURE_OBSERVATION]; NSString * name = [feature valueForKey:FEATURE_SENSOR]; if ([value isKindOfClass:[NSArray class]]) { NSMutableSet * multipleValues = [self.multipleValueList valueForKey:name]; if (multipleValues == nil) { multipleValues = [NSMutableSet set]; [self.multipleValueList setValue:multipleValues forKey:name]; } NSArray * valueArray = (NSArray *) value; for (NSObject * item in valueArray) { NSString * newName = [NSString stringWithFormat:@"%@ :: %@", name, item, nil]; [example setValue:@"true" forKey:newName]; [multipleValues addObject:newName]; } } else if ([value isKindOfClass:[NSNumber class]]) { NSString * newName = [NSString stringWithFormat:@"%@ %@", name, NUMERIC]; [example setValue:value forKey:newName]; } else [example setValue:value forKey:name]; } [self.examples addObject:example]; for (NSSet * values in [self.multipleValueList allValues]) { for (NSString * name in values) { for (NSDictionary * example in self.examples) { if ([example valueForKey:name] == nil) [example setValue:@"false" forKey:name]; } } } [self save]; [self load]; } - (NSString *) labelForTree:(NSDictionary *) treeDict features:(NSArray *) features { NSString * name = [[treeDict valueForKey:FEATURE_SENSOR] valueForKey:FEATURE_NAME]; NSNumber * split = [[treeDict valueForKey:FEATURE_SENSOR] valueForKey:SPLIT_VALUE]; NSString * label = [treeDict valueForKey:LABEL_KEY]; if (label != nil) return label; // Swap out soon. if (name == nil) return nil; NSString * value = @"false"; // NSLog (@"[%@] name = %@; split = %@", key, name, split); for (NSDictionary * feature in features) { NSObject * observation = [feature valueForKey:FEATURE_OBSERVATION]; NSString * sensor = [feature valueForKey:FEATURE_SENSOR]; if ([observation isKindOfClass:[NSArray class]]) { NSArray * obs = (NSArray *) observation; for (NSObject * ob in obs) { NSString * newName = [NSString stringWithFormat:@"%@ :: %@", sensor, ob, nil]; if ([name isEqualToString:newName]) value = @"true"; } } else if (split != nil && [name isEqualToString:[NSString stringWithFormat:@"%@ %@", sensor, NUMERIC]]) { // NSLog (@"obs class = %@", [observation class]); if ([observation isKindOfClass:[NSNumber class]]) { NSNumber * number = (NSNumber *) observation; // NSLog (@"%@ <= %@?", number, split); if ([number doubleValue] >= [split doubleValue]) value = @"true"; } } else if ([name isEqualToString:sensor]) { value = [observation description]; } } NSDictionary * child = [treeDict valueForKey:value]; if (child != nil) return [self labelForTree:child features:features]; NSMutableDictionary * predictions = [NSMutableDictionary dictionary]; for (child in [treeDict allValues]) { if ([child isKindOfClass:[NSDictionary class]]) { NSString * prediction = [self labelForTree:child features:features]; NSNumber * count = [child valueForKey:EXAMPLE_COUNT]; if (prediction != nil) { NSNumber * predictionCount = [predictions valueForKey:prediction]; if (predictionCount == nil) predictionCount = [NSNumber numberWithInteger:0]; predictionCount = [NSNumber numberWithInteger:([predictionCount integerValue] + [count integerValue])]; [predictions setValue:predictionCount forKey:prediction]; } } } NSInteger max = 0; NSString * maxPrediction = nil; for (NSString * prediction in [predictions allKeys]) { NSInteger count = [[predictions valueForKey:prediction] integerValue]; if (count > max) { max = count; maxPrediction = prediction; } } return maxPrediction; } - (NSUInteger) nodeCountForTree:(NSDictionary *) tree { if ([tree valueForKey:LABEL_KEY] != nil) return 1; NSUInteger count = 1; for (NSObject * item in [tree allValues]) { if ([item isKindOfClass:[NSDictionary class]]) count += [self nodeCountForTree:((NSDictionary *) item)]; } return count; } - (NSUInteger) nodeDepthForTree:(NSDictionary *) tree { if ([tree valueForKey:LABEL_KEY] != nil) return 1; NSUInteger maxDepth = 0; for (NSObject * item in [tree allValues]) { if ([item isKindOfClass:[NSDictionary class]]) { NSUInteger depth = [self nodeDepthForTree:((NSDictionary *) item)]; if (depth > maxDepth) maxDepth = depth; } } return 1 + maxDepth; } - (NSString *) htmlForTree:(NSDictionary *) tree name:(NSString *) name { NSString * label = [tree valueForKey:LABEL_KEY]; if (label != nil) { if (name == nil) { NSString * html = [NSString stringWithFormat:@"
Default Value
\%@
\%@ matches
\%@
\%@ matches
", name, name, label, [tree valueForKey:EXAMPLE_COUNT], nil]; return html; } } NSDictionary * feature = [tree valueForKey:FEATURE_SENSOR]; NSString * featureDesc = [feature valueForKey:FEATURE_NAME]; NSNumber * split = [feature valueForKey:SPLIT_VALUE]; if (split != nil) featureDesc = [NSString stringWithFormat:@"%@ >= %@", featureDesc, split]; NSMutableString * html = [NSMutableString stringWithFormat:@"%@
", featureDesc , nil]; for (NSString * k in [[tree allKeys] sortedArrayUsingFunction:C45Sort context:NULL]) { NSObject * value = [tree valueForKey:k]; if ([value isKindOfClass:[NSDictionary class]] && ![k isEqualToString:FEATURE_SENSOR]) { [html appendString:@"