From 6cc1d32d75879cee5c58cf04f38a9ed76039468a Mon Sep 17 00:00:00 2001 From: badra001 Date: Thu, 29 Jan 2026 15:09:50 -0500 Subject: [PATCH] fix --- .../tag-checker/analyze-tag-data.py | 87 ++++++++++--------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py b/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py index 0f6bc4cc..45d09ff2 100755 --- a/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py +++ b/local-app/python-tools/cross-organization/tag-checker/analyze-tag-data.py @@ -12,7 +12,7 @@ from datetime import datetime from difflib import SequenceMatcher -__version__ = "1.0.7" +__version__ = "1.0.8" def get_args(): parser = argparse.ArgumentParser(description=f"AWS Tag Data Analyzer v{__version__}") @@ -20,41 +20,49 @@ def get_args(): parser.add_argument("--required-tags-file", help="CSV with mandatory TagKey list") parser.add_argument("--findings-file", nargs='+', required=True, help="Scanner CSV findings") parser.add_argument("--legacy-map", help="JSON file with {'LegacyKey': 'NewKey'} mapping") - parser.add_argument("--service-map", help="Path to aws_service_map.json (optional)") + parser.add_argument("--service-map", help="Path to aws_service_map.json") parser.add_argument("--output", help="Output prefix") parser.add_argument("--similarity-threshold", type=float, default=0.85) return parser.parse_args() def load_service_map(map_path): - """Loads mapping from local JSON if available, else returns minimal defaults.""" if map_path and os.path.exists(map_path): with open(map_path, 'r') as f: return json.load(f) - return {"ec2": "Amazon EC2", "s3": "Amazon S3", "lambda": "AWS Lambda", "iam": "AWS IAM"} + return None + +def parse_arn_details(arn): + """Extracts service and primary resource type (e.g., ec2 vpc).""" + try: + parts = arn.split(':') + if len(parts) < 6: return "unknown", "unknown" + service = parts[2] + resource_part = parts[5] + # Break at first / or : to get type (e.g., volume/vol-123 -> volume) + res_type = re.split(r'[:/]', resource_part)[0] + return service, res_type + except: + return "unknown", "unknown" def get_similarity(a, b): a_norm = re.sub(r'[:_\-\s]', '', a.lower()) b_norm = re.sub(r'[:_\-\s]', '', b.lower()) - if a_norm == b_norm: return 1.0 - return SequenceMatcher(None, a, b).ratio() + return 1.0 if a_norm == b_norm else SequenceMatcher(None, a, b).ratio() def analyze(): args = get_args() - start_ts = datetime.now() # --- LOAD METADATA --- input_tags = {} - stats = {"total": 0, "active": 0, "inactive": 0, "aws": 0, "cust": 0} + stats = {"total": 0, "active": 0, "inactive": 0} with open(args.tags_file, mode='r', encoding='utf-8-sig') as f: for row in csv.DictReader(f): - k = row.get('TagKey', '').strip().replace('"', '') + k = row.get('TagKey', '').strip() if not k: continue s = row.get('Status', 'Unknown').strip().lower() input_tags[k] = s stats["total"] += 1 if s == 'active': stats["active"] += 1 elif s == 'inactive': stats["inactive"] += 1 - if k.lower().startswith('aws:'): stats["aws"] += 1 - else: stats["cust"] += 1 required_keys = set() if args.required_tags_file: @@ -71,9 +79,9 @@ def analyze(): findings_count = Counter() tag_values = defaultdict(Counter) account_map = defaultdict(set) - resource_tags = defaultdict(dict) # ARN -> {Key: Val} - resource_info = {} # ARN -> {Acc, Reg} - service_counts = Counter() + resource_tags = defaultdict(dict) + resource_info = {} + service_distribution = Counter() files = [] for p in args.findings_file: files.extend(glob.glob(p)) @@ -90,27 +98,23 @@ def analyze(): resource_tags[arn][tag] = val resource_info[arn] = {"acc": acc, "reg": reg} - svc = arn.split(':')[2] if len(arn.split(':')) > 2 else "unknown" - service_counts[svc] = service_counts.get(svc, set()) - service_counts[svc].add(arn) + svc, r_type = parse_arn_details(arn) + service_distribution[f"{svc} {r_type}"] += 1 - # Convert service sets to counts - svc_final_counts = {k: len(v) for k, v in service_counts.items()} max_tag_len = max([len(t) for t in findings_count.keys()] + [20]) col1 = max_tag_len + 2 div = "=" * (col1 + 65) - # --- SECTION 1 & 2: SUMMARY & VALUES --- + # --- SECTIONS 1-5 (Restored) --- print(f"\n{div}\nSECTION 1: GLOBAL SUMMARY\n{div}") print(f"Input Tags: {stats['total']} | Active: {stats['active']} | Found: {len(findings_count)}") - print(f"Total Resource Hits: {sum(findings_count.values())} | Accounts with Hits: {len(set().union(*account_map.values()))}") + print(f"Total Hits: {sum(findings_count.values())} | Accounts: {len(set().union(*account_map.values()))}") print(f"\n{div}\nSECTION 2: TAG VALUE DISTRIBUTION (TOP 5)\n{div}") for tag in sorted(findings_count.keys()): vals = ", ".join([f"{v}({c})" for v, c in tag_values[tag].most_common(5)]) print(f"{tag.ljust(col1)} | {vals}") - # --- SECTION 3: SIMILARITY --- print(f"\n{div}\nSECTION 3: SUSPECTED DUPLICATES (TYPOS/CASE)\n{div}") tags = sorted(findings_count.keys()) for i in range(len(tags)): @@ -118,33 +122,32 @@ def analyze(): if get_similarity(tags[i], tags[j]) >= args.similarity_threshold: print(f"[!] {tags[i]} <-> {tags[j]}") - # --- SECTION 4: COVERAGE --- if required_keys: print(f"\n{div}\nSECTION 4: COVERAGE & COMPLIANCE\n{div}") - non_compliant = [] - for arn, tags in resource_tags.items(): - missing = required_keys - set(tags.keys()) - if missing: non_compliant.append((arn, missing)) - print(f"Total Resources: {len(resource_tags)} | Fully Compliant: {len(resource_tags)-len(non_compliant)}") - print(f"Compliance Rate: {((len(resource_tags)-len(non_compliant))/len(resource_tags)*100):.2f}%") - - # --- SECTION 5: LEGACY MAPPING --- + non_compliant = sum(1 for tags in resource_tags.values() if not (required_keys <= set(tags.keys()))) + print(f"Total Resources: {len(resource_tags)} | Non-Compliant: {non_compliant}") + print(f"Compliance Rate: {((len(resource_tags)-non_compliant)/len(resource_tags)*100):.2f}%") + if legacy_map: print(f"\n{div}\nSECTION 5: LEGACY MIGRATION STATUS\n{div}") - pending = 0 - for arn, tags in resource_tags.items(): - for leg, targ in legacy_map.items(): - if leg in tags and targ not in tags: pending += 1 - print(f"Pending Migration Tasks (Legacy Key exists, New Key missing): {pending}") + pending = sum(1 for tags in resource_tags.values() for leg, targ in legacy_map.items() if leg in tags and targ not in tags) + print(f"Pending Migration Tasks: {pending}") # --- SECTION 6: SERVICE DISTRIBUTION --- print(f"\n{div}\nSECTION 6: SERVICE DISTRIBUTION\n{div}") - print(f"{'Service Prefix':<20} | {'Friendly Name':<40} | {'Count'}") - for svc, count in sorted(svc_final_counts.items(), key=lambda x: x[1], reverse=True): - print(f"{svc:<20} | {service_map.get(svc, 'Unknown').ljust(40)} | {count}") - - mem_mb = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024, 2) - print(f"\n{div}\nPeak Memory Usage: {mem_mb} MB\n{div}") + + if service_map: + print(f"{'Service & Type':<30} | {'Friendly Name':<40} | {'Count'}") + print("-" * (30 + 40 + 15)) + for key in sorted(service_distribution.keys()): + prefix = key.split(' ')[0] + friendly = service_map.get(prefix, 'Unknown') + print(f"{key:<30} | {friendly:<40} | {service_distribution[key]}") + else: + print(f"{'Service & Type':<30} | {'Count'}") + print("-" * (30 + 15)) + for key in sorted(service_distribution.keys()): + print(f"{key:<30} | {service_distribution[key]}") if __name__ == "__main__": analyze()