From dbd0fa6e61bdcc0e084a24fbd443aad59397699d Mon Sep 17 00:00:00 2001 From: badra001 Date: Thu, 29 Jan 2026 15:13:29 -0500 Subject: [PATCH] unique resources only --- .../tag-checker/analyze-tag-data.py | 52 +++++++------------ 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py b/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py index c2cae1a1..caa7081f 100755 --- a/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py +++ b/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py @@ -12,13 +12,13 @@ from datetime import datetime from difflib import SequenceMatcher -__version__ = "1.0.9" +__version__ = "1.1.10" # Services that do NOT have a resource type in the 6th ARN field -# They use: arn:aws:service:region:account:resource-name +# Format: arn:aws:service:region:account:resource-name SERVICES_WITHOUT_TYPES = [ "s3", "sns", "sqs", "codepipeline", "codebuild", - "cloudwatch", "events", "logs", "sns", "states" + "cloudwatch", "events", "logs", "states", "athena", "glue" ] def get_args(): @@ -48,12 +48,10 @@ def parse_arn_details(arn): service = parts[2] resource_part = parts[5] - # If the service is known to skip the type field, just return the service if service in SERVICES_WITHOUT_TYPES: return service, "" - # Otherwise, find the type (break at first / or :) - # Example: 'ec2' 'instance/i-123' -> 'ec2' 'instance' + # Break at first / or : to get type (e.g., instance/i-123 -> instance) res_type = re.split(r'[:/]', resource_part)[0] return service, res_type except: @@ -96,7 +94,9 @@ def analyze(): tag_values = defaultdict(Counter) account_map = defaultdict(set) resource_tags = defaultdict(dict) - service_distribution = Counter() + + # Track unique resources per service: { "service name": set(arns) } + service_resource_tracking = defaultdict(set) files = [] for p in args.findings_file: files.extend(glob.glob(p)) @@ -114,55 +114,39 @@ def analyze(): svc, r_type = parse_arn_details(arn) display_name = f"{svc} {r_type}".strip() - service_distribution[display_name] += 1 + service_resource_tracking[display_name].add(arn) max_tag_len = max([len(t) for t in findings_count.keys()] + [20]) col1 = max_tag_len + 2 div = "=" * (col1 + 65) - # --- SECTIONS 1-5 (Summary, Values, Similarity, Compliance, Legacy) --- + # --- SECTION 1: GLOBAL SUMMARY --- print(f"\n{div}\nSECTION 1: GLOBAL SUMMARY\n{div}") print(f"Input Tags: {stats['total']} | Active: {stats['active']} | Found: {len(findings_count)}") - print(f"Total Hits: {sum(findings_count.values())} | Accounts: {len(set().union(*account_map.values()))}") + print(f"Total Unique Resources Found: {len(resource_tags)}") + print(f"Total Resource Hits: {sum(findings_count.values())} | Accounts: {len(set().union(*account_map.values()))}") + # --- SECTION 2: VALUES --- print(f"\n{div}\nSECTION 2: TAG VALUE DISTRIBUTION (TOP 5)\n{div}") for tag in sorted(findings_count.keys()): vals = ", ".join([f"{v}({c})" for v, c in tag_values[tag].most_common(5)]) print(f"{tag.ljust(col1)} | {vals}") - print(f"\n{div}\nSECTION 3: SUSPECTED DUPLICATES (TYPOS/CASE)\n{div}") - tags_list = sorted(findings_count.keys()) - for i in range(len(tags_list)): - for j in range(i + 1, len(tags_list)): - if get_similarity(tags_list[i], tags_list[j]) >= args.similarity_threshold: - print(f"[!] {tags_list[i]} <-> {tags_list[j]}") - - if required_keys: - print(f"\n{div}\nSECTION 4: COVERAGE & COMPLIANCE\n{div}") - non_compliant = sum(1 for tags in resource_tags.values() if not (required_keys <= set(tags.keys()))) - print(f"Total Resources: {len(resource_tags)} | Non-Compliant: {non_compliant}") - print(f"Compliance Rate: {((len(resource_tags)-non_compliant)/len(resource_tags)*100):.2f}%") - - if legacy_map: - print(f"\n{div}\nSECTION 5: LEGACY MIGRATION STATUS\n{div}") - pending = sum(1 for tags in resource_tags.values() for leg, targ in legacy_map.items() if leg in tags and targ not in tags) - print(f"Pending Migration Tasks: {pending}") - - # --- SECTION 6: SERVICE DISTRIBUTION --- - print(f"\n{div}\nSECTION 6: SERVICE DISTRIBUTION\n{div}") + # --- SECTION 6: SERVICE DISTRIBUTION (UNIQUE RESOURCE COUNT) --- + print(f"\n{div}\nSECTION 6: SERVICE DISTRIBUTION (UNIQUE RESOURCES)\n{div}") if service_map: print(f"{'Service & Type':<30} | {'Friendly Name':<40} | {'Count'}") print("-" * (30 + 40 + 15)) - for key in sorted(service_distribution.keys()): + for key in sorted(service_resource_tracking.keys()): prefix = key.split(' ')[0] friendly = service_map.get(prefix, 'Unknown') - print(f"{key:<30} | {friendly:<40} | {service_distribution[key]}") + print(f"{key:<30} | {friendly:<40} | {len(service_resource_tracking[key])}") else: print(f"{'Service & Type':<30} | {'Count'}") print("-" * (30 + 15)) - for key in sorted(service_distribution.keys()): - print(f"{key:<30} | {service_distribution[key]}") + for key in sorted(service_resource_tracking.keys()): + print(f"{key:<30} | {len(service_resource_tracking[key])}") if __name__ == "__main__": analyze()