Reputation: 65
I am calculating the bandwidth used per domain and also need to know how many times each domain was hit. I was able to calculate the bandwidth, but I am not sure how to count the occurrences of each domain in the logs. Any ideas would be a great help. Thank you in advance.
Code:
import os
import re
from collections import defaultdict
import string
# Directories whose files are read as proxy access logs.
path = ["/var/logs/"]

# Marker that precedes the requested target in a log line, and the
# separator that follows it.
_TARGET_START = "CONNECT "
_TARGET_END = " -"

# Position of the transferred-bytes field, keyed by the number of
# space-separated fields in the line (the two layouts seen in the logs).
_BANDWIDTH_FIELD = {10: 3, 11: 4}


def iter_log_lines(directories):
    """Yield every line of every regular file found in *directories*."""
    for directory in directories:
        for filename in os.listdir(directory):
            full_path = os.path.join(directory, filename)
            # Sub-directories cannot be opened as files; skip them.
            if not os.path.isfile(full_path):
                continue
            with open(full_path, 'r') as filedata:
                for line in filedata:
                    yield line


def _parse_connect_line(line):
    """Return ``(domain, bandwidth)`` for a log line, or None if unusable.

    A line is unusable when it has neither 10 nor 11 space-separated
    fields, contains no ``CONNECT `` marker, or carries a non-numeric
    bandwidth field.  Such lines are skipped instead of being booked with
    the previous line's bandwidth.
    """
    fields = line.split(" ")
    index = _BANDWIDTH_FIELD.get(len(fields))
    if index is None:
        return None

    _, marker, remainder = line.partition(_TARGET_START)
    if not marker:
        return None
    target = remainder.partition(_TARGET_END)[0]

    # Drop any path after the host ("www.xyz.com/index.php" -> "www.xyz.com").
    # A target that starts with "/" is kept whole, as before.
    domain = target.split("/")[0] or target

    try:
        return domain, int(fields[index])
    except ValueError:
        return None


def bandwidth_by_domain(lines):
    """Sum the bandwidth field per domain over an iterable of log lines.

    Returns a mapping of domain -> total of the bandwidth fields.
    """
    totals = defaultdict(int)
    for line in lines:
        parsed = _parse_connect_line(line)
        if parsed is None:
            continue
        domain, bandwidth = parsed
        totals[domain] += bandwidth
    return totals


def to_megabytes(total):
    """Convert a summed bandwidth value to the whole-MB figure reported."""
    # NOTE(review): the factor of 2 is carried over unchanged from the
    # original calculation; its reason is not visible here -- confirm.
    return (total * 2) // (1024 * 1024)


if __name__ == "__main__":
    for key, value in bandwidth_by_domain(iter_log_lines(path)).items():
        print("%s %d" % (key, to_megabytes(value)))
An example log file under /var/logs contains the following lines:
1569935790.563 1010 192.168.10.3 TCP_TUNNEL/200 1001803 CONNECT www.google.com:443 - HIER_DIRECT/www.google.com - 192.168.100.3
1569935790.563 1010 192.168.10.3 TCP_TUNNEL/200 1001085 CONNECT www.google.com:443 - HIER_DIRECT/www.google.com - 192.168.100.3
1569935790.563 1010 192.168.10.3 TCP_TUNNEL/200 1000182 CONNECT www.google.com:443 - HIER_DIRECT/www.google.com - 192.168.100.3
1569935790.563 1010 192.168.10.3 TCP_TUNNEL/200 1006183 CONNECT www.xyz.com/index.php - HIER_DIRECT/www.xyz.com - 192.168.100.3
1569935790.563 1010 192.168.10.3 TCP_TUNNEL/200 1091083 CONNECT www.xyz.com/index.php - HIER_DIRECT/www.xyz.com - 192.168.100.3
1569935790.563 1010 192.168.10.3 TCP_TUNNEL/200 2091803 CONNECT www.xyz.com/index.php - HIER_DIRECT/www.xyz.com - 192.168.100.3
1569935790.563 1010 192.168.10.3 TCP_TUNNEL/200 2091083 CONNECT www.xyz.com/index.php - HIER_DIRECT/www.xyz.com - 192.168.100.3
59375 192.168.10.3 TAG_NONE/503 10 CONNECT www.google.com - HIER_NONE/- - 192.168.100.3
Output should be in the format:
Domain Bandwidth (MB) Hit (Count)
www.xyz.com 11 4
www.google.com 5 3
Upvotes: 0
Views: 113
Reputation: 141
import os
import re
from collections import defaultdict, Counter
import string
# Compile the regex once, up front, so it is not rebuilt for every line.
domain_pattern = re.compile(r"(CONNECT )(?P<domain>.*?)( -)")

# Directories whose files are read as proxy access logs.
path = ["/var/logs/"]

# Position of the transferred-bytes field, keyed by the number of
# space-separated fields in the line (the two layouts seen in the logs).
_BANDWIDTH_FIELD = {10: 3, 11: 4}


def iter_log_lines(directories):
    """Yield every line of every regular file found in *directories*."""
    for directory in directories:
        for filename in os.listdir(directory):
            full_path = os.path.join(directory, filename)
            # Sub-directories cannot be opened as files; skip them.
            if not os.path.isfile(full_path):
                continue
            with open(full_path, 'r') as filedata:
                for line in filedata:
                    yield line


def _parse_line(line):
    """Return ``(domain, bandwidth)`` for a log line, or None if unusable.

    A line is unusable when the pattern finds no CONNECT target, when it
    has neither 10 nor 11 space-separated fields, or when its bandwidth
    field is not an integer.
    """
    match = domain_pattern.search(line)
    if match is None:
        return None

    fields = line.split(" ")
    index = _BANDWIDTH_FIELD.get(len(fields))
    if index is None:
        return None

    # Keep only the host part ("www.xyz.com/index.php" -> "www.xyz.com").
    # A ":port" suffix is left in place, so "host:443" and "host" stay
    # separate keys.
    domain = match.group('domain').split("/")[0]

    try:
        return domain, int(fields[index])
    except ValueError:
        return None


def tally_domains(lines):
    """Accumulate bandwidth and hit count per domain.

    Returns ``(bandwidths, counts)``: a defaultdict of domain -> summed
    bandwidth field and a Counter of domain -> number of usable lines.
    Unusable lines are skipped and contribute to neither.
    """
    # defaultdict for storing and updating the sum of bandwidths
    bandwidths = defaultdict(int)
    # Counter for storing and updating the count of hits
    counts = Counter()
    for line in lines:
        parsed = _parse_line(line)
        if parsed is None:
            continue
        domain, bandwidth = parsed
        bandwidths[domain] += bandwidth
        counts[domain] += 1
    return bandwidths, counts


def to_megabytes(total):
    """Convert a summed bandwidth value to the whole-MB figure reported."""
    # NOTE(review): the factor of 2 is carried over unchanged from the
    # original calculation; its reason is not visible here -- confirm.
    return (total * 2) // (1024 * 1024)


if __name__ == "__main__":
    bandwidths, counts = tally_domains(iter_log_lines(path))
    for domain in bandwidths:
        print("%s %d %d" % (domain, to_megabytes(bandwidths[domain]), counts[domain]))
Running the above code on the sample logs gives the following output:
www.google.com:443 5 3
www.xyz.com 11 4
www.google.com 0 1
Upvotes: 1