Commit a45d85d1 authored by fred

webstats: revamp to ignore partial downloads

parent a3f3ae48
from ipaddress import ip_address
import os
import re
import dateutil.parser
......@@ -10,42 +11,75 @@ from emissions.models import SoundFile
from panikweb.webstats.models import PodcastLogLine
class Sighting:
    """One matched access-log request for a sound file.

    Holds the client IP, the requested path, the user agent, the log
    timestamp and the response size of the request, plus a ``stored``
    flag that starts out as ``False``.
    """

    # Class-level defaults; every instance overrides all of them in
    # __init__ except ``stored``.
    ip = path = user_agent = log_datetime = None
    size = 0
    stored = False

    def __init__(self, ip, path, user_agent, log_datetime, size):
        """Record the request fields; ``size`` is coerced with ``int()``."""
        self.ip, self.path, self.user_agent = ip, path, user_agent
        self.log_datetime = log_datetime
        self.size = int(size)

    def seen_key(self):
        """Return the ``(ip, path, user_agent)`` tuple identifying this request."""
        key = (self.ip, self.path, self.user_agent)
        return key
class Command(BaseCommand):
def handle(self, *args, **kwargs):
# Scan the access log named by settings.ACCESS_LOG_FILENAME for GET requests
# under /media/sounds/ and record them through PodcastLogLine.
#
# NOTE(review): this span is a rendered diff. Several statements appear twice
# in slightly different forms with no +/- markers and no indentation; the
# first of each pair looks like the removed (pre-commit) line and the second
# like the added (post-commit) line. Confirm against the repository before
# treating any of it as runnable code.

# NOTE(review): `verbose` is not read anywhere in the visible lines.
verbose = bool(kwargs.get('verbosity') > 0)
regex = re.compile(
# NOTE(review): appears to be the removed pattern (5 groups, size not captured).
r'([(a-f\d\.\:)]+) - - \[(.*?)\] "GET /media/sounds/(.*?) HTTP/..." \d+ \d+ "(.*?)" "(.*?)"'
# Variant that also captures the second number after the request string;
# its six groups are unpacked below as ip, date, path, size, referrer,
# user_agent.
r'([(a-f\d\.\:)]+) - - \[(.*?)\] "GET /media/sounds/(.*?) HTTP/..." \d+ (\d+) "(.*?)" "(.*?)"'
)
# Keyed by (ip, path, user_agent). The Sighting-based lines store Sighting
# objects here; the older-looking lines store the log datetime instead.
seen = {}
# NOTE(review): the next two `with` lines look like the old/new pair of the
# same statement (the second only adds an explicit 'rt' mode).
with open(settings.ACCESS_LOG_FILENAME) as fd:
with open(settings.ACCESS_LOG_FILENAME, 'rt') as fd:
for line in fd:
match = regex.match(line)
if not match:
continue
# NOTE(review): old (5-value) and new (6-value) unpacking of the same match.
ip, date, path, referrer, user_agent = match.groups()
ip, date, path, size, referrer, user_agent = match.groups()
# Only .ogg and .mp3 paths are considered.
if not (path.endswith('.ogg') or path.endswith('.mp3')):
continue
# The first ':' in the logged date is replaced by a space before parsing;
# presumably it separates the date from the time -- TODO confirm log format.
log_datetime = dateutil.parser.parse(date.replace(':', ' ', 1))
# NOTE(review): the next four lines look like the removed time-window logic
# (skip when the same key was seen less than 3600 seconds earlier).
previous_sighting = seen.get((ip, path, user_agent))
if previous_sighting and (log_datetime - previous_sighting).total_seconds() < 3600:
# don't record requests that already happened less than an
# hour ago.
# Sighting-based logic: look up an earlier request with the same key.
sighting = Sighting(ip, path, user_agent, log_datetime, size)
previous_sighting = seen.get(sighting.seen_key())
if previous_sighting:
if previous_sighting.stored:
# skip sighting that has already been stored
continue
# most likely partial downloads, add to previous
previous_sighting.size += sighting.size
# Continue with the accumulated earlier sighting (keeps its log_datetime).
sighting = previous_sighting
try:
# Size on disk of the requested file under MEDIA_ROOT/sounds.
real_size = os.stat(os.path.join(settings.MEDIA_ROOT, 'sounds', path)).st_size
except OSError:
# skip invalid file
continue
# Remember the sighting even when it is below the threshold, so that later
# requests with the same key can add to its size.
seen[sighting.seen_key()] = sighting
# Threshold: accumulated size must reach one fifth of the file size.
if sighting.size < real_size / 5:
# don't record under downloaded size threshold
continue
# NOTE(review): appears to be a removed line from the time-window logic.
seen[(ip, path, user_agent)] = log_datetime
try:
# The SoundFile id is the third-from-last '_'-separated piece of the path.
soundfile = SoundFile.objects.get(id=path.split('_')[-3])
except SoundFile.DoesNotExist:
continue
PodcastLogLine.objects.update_or_create(
# NOTE(review): the next three keyword arguments look like the removed
# forms of the three that follow them.
timestamp=log_datetime,
ip=self.anonymise_ip(ip),
path=path,
timestamp=sighting.log_datetime,
ip=self.anonymise_ip(sighting.ip),
path=sighting.path,
soundfile=soundfile,
# NOTE(review): old/new pair for the user_agent argument.
user_agent=user_agent,
user_agent=sighting.user_agent,
defaults={
# The referrer comes from the current log line, not from the sighting.
'referrer': referrer,
},
)
# Flag the sighting so later lines with the same key hit the `stored` skip above.
sighting.stored = True
def anonymise_ip(self, ip):
address = ip_address(ip)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment