diff --git a/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py b/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py
index b257e250..f92bf186 100755
--- a/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py
+++ b/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py
@@ -12,46 +12,41 @@
 from datetime import datetime
 from difflib import SequenceMatcher
 
-__version__ = "1.1.11"
+__version__ = "1.1.12"
 
-# Services that do NOT have a resource type in the 6th ARN field (Flat ARNs)
+# Flat-ARN services: no resource-type segment in the ARN's resource field
 SERVICES_WITHOUT_TYPES = [
-    "s3", "sns", "sqs", "codepipeline", "codebuild",
-    "cloudwatch", "events", "logs", "states", "athena", "glue"
+    "s3", "sns", "sqs", "codepipeline", "codebuild", "cloudwatch",
+    "events", "logs", "states", "athena", "glue", "route53"
 ]
 
 def get_args():
     parser = argparse.ArgumentParser(description=f"AWS Tag Data Analyzer v{__version__}")
-    parser.add_argument("--tags-file", required=True, help="Original CSV with TagKey, Status, etc.")
-    parser.add_argument("--required-tags-file", help="CSV with mandatory TagKey list")
-    parser.add_argument("--findings-file", nargs='+', required=True, help="Scanner CSV findings")
-    parser.add_argument("--legacy-map", help="JSON file with {'LegacyKey': 'NewKey'} mapping")
-    parser.add_argument("--service-map", help="Path to aws_service_map.json")
-    parser.add_argument("--output", help="Output prefix")
-    parser.add_argument("--similarity-threshold", type=float, default=0.85)
+    parser.add_argument("--tags-file", required=True, help="Full inventory CSV with TagKey and Status")
+    parser.add_argument("--findings-file", nargs='+', required=True, help="Scanner CSV findings (supports wildcards)")
+    parser.add_argument("--required-tags-file", help="CSV with mandatory TagKey list for Section 4")
+    parser.add_argument("--legacy-map", help="JSON file for Section 5 migration {'OldKey': 'NewKey'}")
+    parser.add_argument("--service-map", default="aws_service_map.json", help="Path to service friendly name JSON")
+    parser.add_argument("--similarity-threshold", type=float, default=0.85, help="Fuzzy match threshold for Section 3")
+    parser.add_argument("--output", help="Prefix for exported files")
     return parser.parse_args()
 
-def load_service_map(map_path):
-    if map_path and os.path.exists(map_path):
+def load_json(path):
+    if path and os.path.exists(path):
         try:
-            with open(map_path, 'r') as f: return json.load(f)
+            with open(path, 'r') as f: return json.load(f)
         except: return None
     return None
 
 def parse_arn_details(arn):
-    """Extracts service and resource type, handling services with no type segment."""
     try:
         parts = arn.split(':')
         if len(parts) < 6: return "unknown", ""
-        service = parts[2]
-        resource_part = parts[5]
-        if service in SERVICES_WITHOUT_TYPES:
-            return service, ""
-        # Split at first / or : to isolate the type
+        service, resource_part = parts[2], parts[5]
+        if service in SERVICES_WITHOUT_TYPES: return service, ""
         res_type = re.split(r'[:/]', resource_part)[0]
         return service, res_type
-    except:
-        return "unknown", ""
+    except Exception: return "unknown", ""
 
 def get_similarity(a, b):
     a_norm = re.sub(r'[:_\-\s]', '', a.lower())
@@ -62,37 +57,33 @@ def analyze():
     args = get_args()
 
     # --- LOAD METADATA ---
-    input_tags = {}
-    stats = {"total": 0, "active": 0}
+    input_tags, stats = {}, {"total": 0, "active": 0, "inactive": 0, "aws_gen": 0}
     with open(args.tags_file, mode='r', encoding='utf-8-sig') as f:
         for row in csv.DictReader(f):
             k = row.get('TagKey', '').strip()
             if not k: continue
-            input_tags[k] = row.get('Status', 'Unknown').strip().lower()
+            s = row.get('Status', 'Unknown').strip().lower()
+            input_tags[k] = s
             stats["total"] += 1
-            if input_tags[k] == 'active': stats["active"] += 1
+            if s == 'active': stats["active"] += 1
+            elif s == 'inactive': stats["inactive"] += 1
+            if k.lower().startswith('aws:'): stats["aws_gen"] += 1
 
     required_keys = set()
-    if args.required_tags_file:
+    if args.required_tags_file and os.path.exists(args.required_tags_file):
         with open(args.required_tags_file, 'r', encoding='utf-8-sig') as f:
             required_keys = {row['TagKey'].strip() for row in csv.DictReader(f) if row.get('TagKey')}
 
-    legacy_map = {}
-    if args.legacy_map:
-        with open(args.legacy_map, 'r') as f: legacy_map = json.load(f)
+    legacy_map = load_json(args.legacy_map)
+    service_map = load_json(args.service_map)
 
-    service_map = load_service_map(args.service_map)
-
-    # --- PROCESS FINDINGS ---
-    findings_count = Counter()
-    tag_values = defaultdict(Counter)
-    account_map = defaultdict(set)
-    resource_tags = defaultdict(dict)
+    # --- DATA AGGREGATION ---
+    findings_count, tag_values, account_map = Counter(), defaultdict(Counter), defaultdict(set)
+    resource_tags, resource_info = defaultdict(dict), {}
     service_resource_tracking = defaultdict(set)
 
     files = []
     for p in args.findings_file: files.extend(glob.glob(p))
-
     for file in files:
         with open(file, mode='r', encoding='utf-8') as f:
             for row in csv.DictReader(f):
@@ -101,47 +92,72 @@ def analyze():
                 tag_values[tag][val] += 1
                 account_map[tag].add(row['account_id'])
                 resource_tags[arn][tag] = val
+                resource_info[arn] = {"acc": row['account_id'], "reg": row['region']}
                 svc, r_type = parse_arn_details(arn)
-                display_name = f"{svc} {r_type}".strip()
-                service_resource_tracking[display_name].add(arn)
+                service_resource_tracking[f"{svc} {r_type}".strip()].add(arn)
 
-    # Dynamic column widths
-    max_tag_len = max([len(t) for t in findings_count.keys()] + [10], default=10)
+    tag_list = sorted(findings_count.keys())
+    max_tag_len = max([len(t) for t in tag_list] + [15], default=15)
     max_svc_len = max([len(s) for s in service_resource_tracking.keys()] + [15], default=15)
-
-    col1_tag = max_tag_len + 2
-    col1_svc = max_svc_len + 2
-    div = "=" * (max(col1_tag, col1_svc) + 65)
+    col1_tag, col1_svc = max_tag_len + 2, max_svc_len + 2
+    div = "=" * (max(col1_tag, col1_svc) + 70)
 
-    # --- SECTION 1: GLOBAL SUMMARY ---
+    # --- SECTION 1: SUMMARY ---
     print(f"\n{div}\nSECTION 1: GLOBAL SUMMARY\n{div}")
-    print(f"Input Tags: {stats['total']} | Active: {stats['active']} | Found: {len(findings_count)}")
-    print(f"Total Unique Resources Found: {len(resource_tags)}")
-    print(f"Total Resource Hits: {sum(findings_count.values())} | Accounts: {len(set().union(*account_map.values()))}")
+    print(f"Total Unique Resources: {len(resource_tags)} | Total Tag Hits: {sum(findings_count.values())}")
+    print(f"Active Tags: {stats['active']} | AWS Reserved: {stats['aws_gen']} | Unique Accounts: {len(set().union(*account_map.values()))}")
 
     # --- SECTION 2: VALUES ---
     print(f"\n{div}\nSECTION 2: TAG VALUE DISTRIBUTION (TOP 5)\n{div}")
-    for tag in sorted(findings_count.keys()):
-        vals = ", ".join([f"{v}({c})" for v, c in tag_values[tag].most_common(5)])
-        print(f"{tag.ljust(col1_tag)} | {vals}")
+    for tag in tag_list:
+        status = input_tags.get(tag, "Not Listed")
+        vals = ", ".join([f"{v or ''}({c})" for v, c in tag_values[tag].most_common(5)])
+        # Case Inconsistency Check
+        raw_vals = [v.lower() for v in tag_values[tag].keys()]
+        if len(set(raw_vals)) < len(tag_values[tag]):
+            vals += " [!] CASE VARIATIONS DETECTED"
+        print(f"{tag.ljust(col1_tag)} | Status: {status:<10} | {vals}")
+
+    # --- SECTION 3: SIMILARITY ---
+    print(f"\n{div}\nSECTION 3: SUSPECTED TAG KEY DUPLICATES (FUZZY MATCH)\n{div}")
+    for i in range(len(tag_list)):
+        for j in range(i + 1, len(tag_list)):
+            if get_similarity(tag_list[i], tag_list[j]) >= args.similarity_threshold:
+                print(f"[!] Similarity: {tag_list[i].ljust(25)} <-> {tag_list[j]}")
+
+    # --- SECTION 4: COMPLIANCE ---
+    if required_keys:
+        print(f"\n{div}\nSECTION 4: COVERAGE & COMPLIANCE (REQUIRED TAGS)\n{div}")
+        non_compliant_map = defaultdict(int)
+        non_compliant_arns = []
+        for arn, tags in resource_tags.items():
+            missing = required_keys - set(tags.keys())
+            if missing:
+                svc, _ = parse_arn_details(arn)
+                non_compliant_map[svc] += 1
+                non_compliant_arns.append((arn, missing))
+        rate = ((len(resource_tags)-len(non_compliant_arns))/len(resource_tags)*100) if resource_tags else 0
+        print(f"Overall Compliance: {rate:.2f}% | Non-Compliant Count: {len(non_compliant_arns)}")
+        print(f"{'-'*40}")
+        print("Non-Compliance by Service:")
+        for s, count in sorted(non_compliant_map.items(), key=lambda x: x[1], reverse=True):
+            print(f"  - {s:<15}: {count} resources missing tags")
+
+    # --- SECTION 5: LEGACY MIGRATION ---
+    if legacy_map:
+        print(f"\n{div}\nSECTION 5: LEGACY MIGRATION TASKS\n{div}")
+        pending = sum(1 for tags in resource_tags.values() if any(leg in tags and targ not in tags for leg, targ in legacy_map.items()))
+        print(f"Resources Pending Migration (Legacy Key exists, New Key missing): {pending}")
 
     # --- SECTION 6: SERVICE DISTRIBUTION ---
     print(f"\n{div}\nSECTION 6: SERVICE DISTRIBUTION (UNIQUE RESOURCES)\n{div}")
-
-    if service_map:
-        header = f"{'Service & Type'.ljust(col1_svc)} | {'Friendly Name':<40} | {'Count'}"
-        print(header)
-        print("-" * len(header))
-        for key in sorted(service_resource_tracking.keys()):
-            prefix = key.split(' ')[0]
-            friendly = service_map.get(prefix, 'Unknown')
-            print(f"{key.ljust(col1_svc)} | {friendly:<40} | {len(service_resource_tracking[key])}")
-    else:
-        header = f"{'Service & Type'.ljust(col1_svc)} | {'Count'}"
-        print(header)
-        print("-" * len(header))
-        for key in sorted(service_resource_tracking.keys()):
-            print(f"{key.ljust(col1_svc)} | {len(service_resource_tracking[key])}")
+    svc_sorted = sorted(service_resource_tracking.keys())
+    hdr = f"{'Service & Type'.ljust(col1_svc)} | {('Friendly Name' if service_map else ''):<40} | Count"
+    print(f"{hdr}\n{'-'*len(hdr)}")
+    for s in svc_sorted:
+        prefix = s.split(' ')[0]
+        friendly = service_map.get(prefix, 'Unknown') if service_map else ""
+        print(f"{s.ljust(col1_svc)} | {friendly:<40} | {len(service_resource_tracking[s])}")
 
 if __name__ == "__main__":
     analyze()
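
A quick sanity check for the reworked ARN parsing and the Section 3 fuzzy matching, kept below the diff so it does not affect patch application. This is a minimal standalone sketch, not part of the patch: it assumes get_similarity's unshown tail mirrors the visible a_norm line (a b_norm counterpart fed to SequenceMatcher's ratio), and the sample ARNs and the trimmed flat-services tuple are invented for illustration.

    import re
    from difflib import SequenceMatcher

    FLAT_SERVICES = ("s3", "sns", "route53")  # illustrative subset of SERVICES_WITHOUT_TYPES

    def parse_arn_details(arn, flat_services=FLAT_SERVICES):
        # Mirrors the patched helper: flat services report an empty resource type.
        try:
            parts = arn.split(':')
            if len(parts) < 6:
                return "unknown", ""
            service, resource_part = parts[2], parts[5]
            if service in flat_services:
                return service, ""
            return service, re.split(r'[:/]', resource_part)[0]
        except Exception:
            return "unknown", ""

    def get_similarity(a, b):
        # Only the a_norm line is visible in the hunk; the rest is assumed symmetric.
        a_norm = re.sub(r'[:_\-\s]', '', a.lower())
        b_norm = re.sub(r'[:_\-\s]', '', b.lower())
        return SequenceMatcher(None, a_norm, b_norm).ratio()

    # Typed ARN: the first [:/]-delimited token of field 6 is the resource type.
    assert parse_arn_details("arn:aws:ec2:us-east-1:123456789012:instance/i-0abc123") == ("ec2", "instance")
    # Flat ARN: s3 bucket ARNs carry no type segment, so the type stays empty.
    assert parse_arn_details("arn:aws:s3:::example-bucket") == ("s3", "")
    # Keys differing only in case/separators normalize identically, clearing the 0.85 default.
    assert get_similarity("CostCenter", "cost_center") >= 0.85

Run as a scratch file against the patched module's logic; it should pass silently if the helpers behave as described above.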