Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
badra001 committed Jan 29, 2026
1 parent 78ce389 commit 6cc1d32
Showing 1 changed file with 45 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,49 +12,57 @@
from datetime import datetime
from difflib import SequenceMatcher

__version__ = "1.0.8"


def get_args(argv=None):
    """Parse the command-line options of the tag analyzer.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behavior of the plain ``get_args()`` call.

    Returns:
        argparse.Namespace with ``tags_file``, ``required_tags_file``,
        ``findings_file`` (a list of one or more values), ``legacy_map``,
        ``service_map``, ``output`` and ``similarity_threshold`` (float,
        default 0.85).

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            a value cannot be converted.
    """
    parser = argparse.ArgumentParser(description=f"AWS Tag Data Analyzer v{__version__}")
    parser.add_argument("--tags-file", required=True, help="Original CSV with TagKey, Status, etc.")
    parser.add_argument("--required-tags-file", help="CSV with mandatory TagKey list")
    parser.add_argument("--findings-file", nargs='+', required=True, help="Scanner CSV findings")
    parser.add_argument("--legacy-map", help="JSON file with {'LegacyKey': 'NewKey'} mapping")
    # An option string may be registered only once: a second
    # add_argument("--service-map") makes argparse raise a conflict error.
    parser.add_argument("--service-map", help="Path to aws_service_map.json")
    parser.add_argument("--output", help="Output prefix")
    parser.add_argument("--similarity-threshold", type=float, default=0.85)
    return parser.parse_args(argv)

def load_service_map(map_path):
    """Load the service map from a local JSON file, if one is available.

    Args:
        map_path: Path to the JSON file, or ``None``/empty when no map was
            supplied.

    Returns:
        The parsed JSON content when ``map_path`` names an existing path;
        otherwise ``None``, signalling that no map is available.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    if not map_path or not os.path.exists(map_path):
        return None
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(map_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def parse_arn_details(arn):
    """Extract the service and primary resource type from an ARN string.

    The ARN is split on ':'; field 2 is returned as the service and the
    leading token of field 5 (up to the first '/' or ':') as the resource
    type, e.g. ``...:ec2:...:volume/vol-123`` -> ``("ec2", "volume")``.

    Args:
        arn: The ARN string to inspect.

    Returns:
        A ``(service, resource_type)`` tuple, or ``("unknown", "unknown")``
        when ``arn`` has fewer than six ':'-separated fields or is not a
        string.
    """
    try:
        parts = arn.split(':')
    except (AttributeError, TypeError):
        # Non-string input (None, bytes, numbers) cannot be split on ':'.
        # Only these errors are handled so that KeyboardInterrupt/SystemExit
        # and genuine bugs are no longer silently swallowed.
        return "unknown", "unknown"
    if len(parts) < 6:
        return "unknown", "unknown"
    service = parts[2]
    resource_part = parts[5]
    # Break at first / or : to get type (e.g., volume/vol-123 -> volume)
    res_type = re.split(r'[:/]', resource_part)[0]
    return service, res_type

def get_similarity(a, b):
    """Return a similarity score for two tag keys.

    Keys that are identical after lower-casing and removing ':', '_', '-'
    and whitespace score exactly 1.0. Otherwise the score is
    ``difflib.SequenceMatcher(None, a, b).ratio()``.

    Args:
        a: First tag key (string).
        b: Second tag key (string).

    Returns:
        A float between 0.0 and 1.0.
    """
    a_norm = re.sub(r'[:_\-\s]', '', a.lower())
    b_norm = re.sub(r'[:_\-\s]', '', b.lower())
    if a_norm == b_norm:
        return 1.0
    # NOTE: the ratio is computed on the original strings, not the
    # normalized ones, so case and separator differences still lower it.
    return SequenceMatcher(None, a, b).ratio()

def analyze():
args = get_args()
start_ts = datetime.now()

# --- LOAD METADATA ---
input_tags = {}
stats = {"total": 0, "active": 0, "inactive": 0, "aws": 0, "cust": 0}
stats = {"total": 0, "active": 0, "inactive": 0}
with open(args.tags_file, mode='r', encoding='utf-8-sig') as f:
for row in csv.DictReader(f):
k = row.get('TagKey', '').strip().replace('"', '')
k = row.get('TagKey', '').strip()
if not k: continue
s = row.get('Status', 'Unknown').strip().lower()
input_tags[k] = s
stats["total"] += 1
if s == 'active': stats["active"] += 1
elif s == 'inactive': stats["inactive"] += 1
if k.lower().startswith('aws:'): stats["aws"] += 1
else: stats["cust"] += 1

required_keys = set()
if args.required_tags_file:
Expand All @@ -71,9 +79,9 @@ def analyze():
findings_count = Counter()
tag_values = defaultdict(Counter)
account_map = defaultdict(set)
resource_tags = defaultdict(dict) # ARN -> {Key: Val}
resource_info = {} # ARN -> {Acc, Reg}
service_counts = Counter()
resource_tags = defaultdict(dict)
resource_info = {}
service_distribution = Counter()

files = []
for p in args.findings_file: files.extend(glob.glob(p))
Expand All @@ -90,61 +98,56 @@ def analyze():
resource_tags[arn][tag] = val
resource_info[arn] = {"acc": acc, "reg": reg}

svc = arn.split(':')[2] if len(arn.split(':')) > 2 else "unknown"
service_counts[svc] = service_counts.get(svc, set())
service_counts[svc].add(arn)
svc, r_type = parse_arn_details(arn)
service_distribution[f"{svc} {r_type}"] += 1

# Convert service sets to counts
svc_final_counts = {k: len(v) for k, v in service_counts.items()}
max_tag_len = max([len(t) for t in findings_count.keys()] + [20])
col1 = max_tag_len + 2
div = "=" * (col1 + 65)

# --- SECTION 1 & 2: SUMMARY & VALUES ---
# --- SECTIONS 1-5 (Restored) ---
print(f"\n{div}\nSECTION 1: GLOBAL SUMMARY\n{div}")
print(f"Input Tags: {stats['total']} | Active: {stats['active']} | Found: {len(findings_count)}")
print(f"Total Resource Hits: {sum(findings_count.values())} | Accounts with Hits: {len(set().union(*account_map.values()))}")
print(f"Total Hits: {sum(findings_count.values())} | Accounts: {len(set().union(*account_map.values()))}")

print(f"\n{div}\nSECTION 2: TAG VALUE DISTRIBUTION (TOP 5)\n{div}")
for tag in sorted(findings_count.keys()):
vals = ", ".join([f"{v}({c})" for v, c in tag_values[tag].most_common(5)])
print(f"{tag.ljust(col1)} | {vals}")

# --- SECTION 3: SIMILARITY ---
print(f"\n{div}\nSECTION 3: SUSPECTED DUPLICATES (TYPOS/CASE)\n{div}")
tags = sorted(findings_count.keys())
for i in range(len(tags)):
for j in range(i + 1, len(tags)):
if get_similarity(tags[i], tags[j]) >= args.similarity_threshold:
print(f"[!] {tags[i]} <-> {tags[j]}")

# --- SECTION 4: COVERAGE ---
if required_keys:
print(f"\n{div}\nSECTION 4: COVERAGE & COMPLIANCE\n{div}")
non_compliant = []
for arn, tags in resource_tags.items():
missing = required_keys - set(tags.keys())
if missing: non_compliant.append((arn, missing))
print(f"Total Resources: {len(resource_tags)} | Fully Compliant: {len(resource_tags)-len(non_compliant)}")
print(f"Compliance Rate: {((len(resource_tags)-len(non_compliant))/len(resource_tags)*100):.2f}%")

# --- SECTION 5: LEGACY MAPPING ---
non_compliant = sum(1 for tags in resource_tags.values() if not (required_keys <= set(tags.keys())))
print(f"Total Resources: {len(resource_tags)} | Non-Compliant: {non_compliant}")
print(f"Compliance Rate: {((len(resource_tags)-non_compliant)/len(resource_tags)*100):.2f}%")

if legacy_map:
print(f"\n{div}\nSECTION 5: LEGACY MIGRATION STATUS\n{div}")
pending = 0
for arn, tags in resource_tags.items():
for leg, targ in legacy_map.items():
if leg in tags and targ not in tags: pending += 1
print(f"Pending Migration Tasks (Legacy Key exists, New Key missing): {pending}")
pending = sum(1 for tags in resource_tags.values() for leg, targ in legacy_map.items() if leg in tags and targ not in tags)
print(f"Pending Migration Tasks: {pending}")

# --- SECTION 6: SERVICE DISTRIBUTION ---
print(f"\n{div}\nSECTION 6: SERVICE DISTRIBUTION\n{div}")
print(f"{'Service Prefix':<20} | {'Friendly Name':<40} | {'Count'}")
for svc, count in sorted(svc_final_counts.items(), key=lambda x: x[1], reverse=True):
print(f"{svc:<20} | {service_map.get(svc, 'Unknown').ljust(40)} | {count}")

mem_mb = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024, 2)
print(f"\n{div}\nPeak Memory Usage: {mem_mb} MB\n{div}")

if service_map:
print(f"{'Service & Type':<30} | {'Friendly Name':<40} | {'Count'}")
print("-" * (30 + 40 + 15))
for key in sorted(service_distribution.keys()):
prefix = key.split(' ')[0]
friendly = service_map.get(prefix, 'Unknown')
print(f"{key:<30} | {friendly:<40} | {service_distribution[key]}")
else:
print(f"{'Service & Type':<30} | {'Count'}")
print("-" * (30 + 15))
for key in sorted(service_distribution.keys()):
print(f"{key:<30} | {service_distribution[key]}")

if __name__ == "__main__":
analyze()

0 comments on commit 6cc1d32

Please sign in to comment.