
Commit

put back sections
badra001 committed Jan 30, 2026
1 parent 5d15d5b commit 802f8fa
Showing 1 changed file with 84 additions and 68 deletions.
@@ -12,46 +12,41 @@
 from datetime import datetime
 from difflib import SequenceMatcher
 
-__version__ = "1.1.11"
+__version__ = "1.1.12"
 
-# Services that do NOT have a resource type in the 6th ARN field (Flat ARNs)
+# Flat services list (Services that do NOT use a type/name hierarchy in ARNs)
 SERVICES_WITHOUT_TYPES = [
-    "s3", "sns", "sqs", "codepipeline", "codebuild",
-    "cloudwatch", "events", "logs", "states", "athena", "glue"
+    "s3", "sns", "sqs", "codepipeline", "codebuild", "cloudwatch",
+    "events", "logs", "states", "athena", "glue", "route53"
 ]
 
 def get_args():
     parser = argparse.ArgumentParser(description=f"AWS Tag Data Analyzer v{__version__}")
-    parser.add_argument("--tags-file", required=True, help="Original CSV with TagKey, Status, etc.")
-    parser.add_argument("--required-tags-file", help="CSV with mandatory TagKey list")
-    parser.add_argument("--findings-file", nargs='+', required=True, help="Scanner CSV findings")
-    parser.add_argument("--legacy-map", help="JSON file with {'LegacyKey': 'NewKey'} mapping")
-    parser.add_argument("--service-map", help="Path to aws_service_map.json")
-    parser.add_argument("--output", help="Output prefix")
-    parser.add_argument("--similarity-threshold", type=float, default=0.85)
+    parser.add_argument("--tags-file", required=True, help="Full inventory CSV with TagKey and Status")
+    parser.add_argument("--findings-file", nargs='+', required=True, help="Scanner CSV findings (supports wildcards)")
+    parser.add_argument("--required-tags-file", help="CSV with mandatory TagKey list for Section 4")
+    parser.add_argument("--legacy-map", help="JSON file for Section 5 migration {'OldKey': 'NewKey'}")
+    parser.add_argument("--service-map", default="aws_service_map.json", help="Path to service friendly name JSON")
+    parser.add_argument("--similarity-threshold", type=float, default=0.85, help="Fuzzy match threshold for Section 3")
+    parser.add_argument("--output", help="Prefix for exported files")
     return parser.parse_args()
 
-def load_service_map(map_path):
-    if map_path and os.path.exists(map_path):
+def load_json(path):
+    if path and os.path.exists(path):
         try:
-            with open(map_path, 'r') as f: return json.load(f)
+            with open(path, 'r') as f: return json.load(f)
         except: return None
     return None
 
 def parse_arn_details(arn):
     """Extracts service and resource type, handling services with no type segment."""
     try:
         parts = arn.split(':')
         if len(parts) < 6: return "unknown", ""
-        service = parts[2]
-        resource_part = parts[5]
-        if service in SERVICES_WITHOUT_TYPES:
-            return service, ""
-        # Split at first / or : to isolate the type
+        service, resource_part = parts[2], parts[5]
+        if service in SERVICES_WITHOUT_TYPES: return service, ""
         res_type = re.split(r'[:/]', resource_part)[0]
         return service, res_type
-    except:
-        return "unknown", ""
+    except: return "unknown", ""
 
 def get_similarity(a, b):
     a_norm = re.sub(r'[:_\-\s]', '', a.lower())
@@ -62,37 +57,33 @@ def analyze():
     args = get_args()
 
     # --- LOAD METADATA ---
-    input_tags = {}
-    stats = {"total": 0, "active": 0}
+    input_tags, stats = {}, {"total": 0, "active": 0, "inactive": 0, "aws_gen": 0}
     with open(args.tags_file, mode='r', encoding='utf-8-sig') as f:
         for row in csv.DictReader(f):
             k = row.get('TagKey', '').strip()
             if not k: continue
-            input_tags[k] = row.get('Status', 'Unknown').strip().lower()
+            s = row.get('Status', 'Unknown').strip().lower()
+            input_tags[k] = s
             stats["total"] += 1
-            if input_tags[k] == 'active': stats["active"] += 1
+            if s == 'active': stats["active"] += 1
+            elif s == 'inactive': stats["inactive"] += 1
+            if k.lower().startswith('aws:'): stats["aws_gen"] += 1
 
     required_keys = set()
-    if args.required_tags_file:
+    if args.required_tags_file and os.path.exists(args.required_tags_file):
         with open(args.required_tags_file, 'r', encoding='utf-8-sig') as f:
             required_keys = {row['TagKey'].strip() for row in csv.DictReader(f) if row.get('TagKey')}
 
-    legacy_map = {}
-    if args.legacy_map:
-        with open(args.legacy_map, 'r') as f: legacy_map = json.load(f)
+    legacy_map = load_json(args.legacy_map)
+    service_map = load_json(args.service_map)
 
-    service_map = load_service_map(args.service_map)
-
-    # --- PROCESS FINDINGS ---
-    findings_count = Counter()
-    tag_values = defaultdict(Counter)
-    account_map = defaultdict(set)
-    resource_tags = defaultdict(dict)
+    # --- DATA AGGREGATION ---
+    findings_count, tag_values, account_map = Counter(), defaultdict(Counter), defaultdict(set)
+    resource_tags, resource_info = defaultdict(dict), {}
     service_resource_tracking = defaultdict(set)
 
     files = []
     for p in args.findings_file: files.extend(glob.glob(p))
 
     for file in files:
         with open(file, mode='r', encoding='utf-8') as f:
             for row in csv.DictReader(f):
@@ -101,47 +92,72 @@ def analyze():
                 tag_values[tag][val] += 1
                 account_map[tag].add(row['account_id'])
                 resource_tags[arn][tag] = val
+                resource_info[arn] = {"acc": row['account_id'], "reg": row['region']}
                 svc, r_type = parse_arn_details(arn)
-                display_name = f"{svc} {r_type}".strip()
-                service_resource_tracking[display_name].add(arn)
+                service_resource_tracking[f"{svc} {r_type}".strip()].add(arn)
 
     # Dynamic column widths
-    max_tag_len = max([len(t) for t in findings_count.keys()] + [10], default=10)
+    tag_list = sorted(findings_count.keys())
+    max_tag_len = max([len(t) for t in tag_list] + [15], default=15)
     max_svc_len = max([len(s) for s in service_resource_tracking.keys()] + [15], default=15)
 
-    col1_tag = max_tag_len + 2
-    col1_svc = max_svc_len + 2
-    div = "=" * (max(col1_tag, col1_svc) + 65)
+    col1_tag, col1_svc = max_tag_len + 2, max_svc_len + 2
+    div = "=" * (max(col1_tag, col1_svc) + 70)
 
-    # --- SECTION 1: GLOBAL SUMMARY ---
+    # --- SECTION 1: SUMMARY ---
     print(f"\n{div}\nSECTION 1: GLOBAL SUMMARY\n{div}")
     print(f"Input Tags: {stats['total']} | Active: {stats['active']} | Found: {len(findings_count)}")
-    print(f"Total Unique Resources Found: {len(resource_tags)}")
-    print(f"Total Resource Hits: {sum(findings_count.values())} | Accounts: {len(set().union(*account_map.values()))}")
+    print(f"Total Unique Resources: {len(resource_tags)} | Total Tag Hits: {sum(findings_count.values())}")
+    print(f"Active Tags: {stats['active']} | AWS Reserved: {stats['aws_gen']} | Unique Accounts: {len(set().union(*account_map.values()))}")
 
     # --- SECTION 2: VALUES ---
     print(f"\n{div}\nSECTION 2: TAG VALUE DISTRIBUTION (TOP 5)\n{div}")
-    for tag in sorted(findings_count.keys()):
-        vals = ", ".join([f"{v}({c})" for v, c in tag_values[tag].most_common(5)])
-        print(f"{tag.ljust(col1_tag)} | {vals}")
+    for tag in tag_list:
+        status = input_tags.get(tag, "Not Listed")
+        vals = ", ".join([f"{v if v else '<EMPTY>'}({c})" for v, c in tag_values[tag].most_common(5)])
+        # Case Inconsistency Check
+        raw_vals = [v.lower() for v in tag_values[tag].keys()]
+        if len(set(raw_vals)) < len(tag_values[tag]):
+            vals += " [!] CASE VARIATIONS DETECTED"
+        print(f"{tag.ljust(col1_tag)} | Status: {status:<10} | {vals}")
 
+    # --- SECTION 3: SIMILARITY ---
+    print(f"\n{div}\nSECTION 3: SUSPECTED TAG KEY DUPLICATES (FUZZY MATCH)\n{div}")
+    for i in range(len(tag_list)):
+        for j in range(i + 1, len(tag_list)):
+            if get_similarity(tag_list[i], tag_list[j]) >= args.similarity_threshold:
+                print(f"[!] Similarity: {tag_list[i].ljust(25)} <-> {tag_list[j]}")
+
+    # --- SECTION 4: COMPLIANCE ---
+    if required_keys:
+        print(f"\n{div}\nSECTION 4: COVERAGE & COMPLIANCE (REQUIRED TAGS)\n{div}")
+        non_compliant_map = defaultdict(int)
+        non_compliant_arns = []
+        for arn, tags in resource_tags.items():
+            missing = required_keys - set(tags.keys())
+            if missing:
+                svc, _ = parse_arn_details(arn)
+                non_compliant_map[svc] += 1
+                non_compliant_arns.append((arn, missing))
+        rate = ((len(resource_tags) - len(non_compliant_arns)) / len(resource_tags) * 100) if resource_tags else 0
+        print(f"Overall Compliance: {rate:.2f}% | Non-Compliant Count: {len(non_compliant_arns)}")
+        print(f"{'-'*40}")
+        print("Non-Compliance by Service:")
+        for s, count in sorted(non_compliant_map.items(), key=lambda x: x[1], reverse=True):
+            print(f"  - {s:<15}: {count} resources missing tags")
+
+    # --- SECTION 5: LEGACY MIGRATION ---
+    if legacy_map:
+        print(f"\n{div}\nSECTION 5: LEGACY MIGRATION TASKS\n{div}")
+        pending = sum(1 for tags in resource_tags.values() for leg, targ in legacy_map.items() if leg in tags and targ not in tags)
+        print(f"Resources Pending Migration (Legacy Key exists, New Key missing): {pending}")
 
     # --- SECTION 6: SERVICE DISTRIBUTION ---
     print(f"\n{div}\nSECTION 6: SERVICE DISTRIBUTION (UNIQUE RESOURCES)\n{div}")
 
-    if service_map:
-        header = f"{'Service & Type'.ljust(col1_svc)} | {'Friendly Name':<40} | {'Count'}"
-        print(header)
-        print("-" * len(header))
-        for key in sorted(service_resource_tracking.keys()):
-            prefix = key.split(' ')[0]
-            friendly = service_map.get(prefix, 'Unknown')
-            print(f"{key.ljust(col1_svc)} | {friendly:<40} | {len(service_resource_tracking[key])}")
-    else:
-        header = f"{'Service & Type'.ljust(col1_svc)} | {'Count'}"
-        print(header)
-        print("-" * len(header))
-        for key in sorted(service_resource_tracking.keys()):
-            print(f"{key.ljust(col1_svc)} | {len(service_resource_tracking[key])}")
+    svc_sorted = sorted(service_resource_tracking.keys())
+    hdr = f"{'Service & Type'.ljust(col1_svc)} | {'Friendly Name' if service_map else '':<40} | {'Count'}"
+    print(f"{hdr}\n{'-' * len(hdr)}")
+    for s in svc_sorted:
+        prefix = s.split(' ')[0]
+        friendly = service_map.get(prefix, 'Unknown') if service_map else ""
+        print(f"{s.ljust(col1_svc)} | {friendly.ljust(40) if service_map else ''} | {len(service_resource_tracking[s])}")
 
 if __name__ == "__main__":
     analyze()
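
Reviewer note: with the new default for --service-map, a run needs only the two required flags. A hypothetical invocation (the script name and data file names are invented for illustration):

    python tag_analyzer.py --tags-file tag_inventory.csv --findings-file "scans/findings_*.csv" --required-tags-file required_tags.csv --legacy-map legacy_map.json

Because --findings-file values are passed through glob.glob, a quoted wildcard is expanded by the script itself, and Section 6 picks up friendly names from aws_service_map.json without an explicit --service-map flag.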

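A minimal sketch of what parse_arn_details returns after this change; the ARNs below are invented examples, not scanner data:

    # Flat service: listed in SERVICES_WITHOUT_TYPES, so no resource type is parsed.
    parse_arn_details("arn:aws:s3:::my-bucket")                              # -> ("s3", "")
    # Typed service: parts[5] is split on the first ':' or '/' to isolate the type.
    parse_arn_details("arn:aws:ec2:us-east-1:111122223333:instance/i-0abc")  # -> ("ec2", "instance")
    # Fewer than six ':'-separated fields trips the length guard.
    parse_arn_details("not-an-arn")                                          # -> ("unknown", "")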
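
The diff truncates get_similarity after its first line. Assuming the remaining body normalizes b the same way and returns SequenceMatcher(None, a_norm, b_norm).ratio() (consistent with the difflib import and the 0.85 default), Section 3 would behave roughly like this on invented tag keys:

    get_similarity("Cost-Center", "cost_center")  # both normalize to "costcenter" -> 1.0, flagged
    get_similarity("CostCentre", "CostCenter")    # "costcentre" vs "costcenter" -> 0.90, flagged
    get_similarity("Environment", "Env")          # ~0.43, stays below the default threshold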