Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4,000 changes: 4,000 additions & 0 deletions raw_data/April2016.CSV

Large diffs are not rendered by default.

3,736 changes: 3,736 additions & 0 deletions raw_data/April2018.CSV

Large diffs are not rendered by default.

4,983 changes: 4,983 additions & 0 deletions raw_data/August2016.CSV

Large diffs are not rendered by default.

4,504 changes: 4,504 additions & 0 deletions raw_data/August2017.CSV

Large diffs are not rendered by default.

4,403 changes: 4,403 additions & 0 deletions raw_data/August2018.CSV

Large diffs are not rendered by default.

3,675 changes: 3,675 additions & 0 deletions raw_data/December2015.CSV

Large diffs are not rendered by default.

3,880 changes: 3,880 additions & 0 deletions raw_data/December2016.CSV

Large diffs are not rendered by default.

3,764 changes: 3,764 additions & 0 deletions raw_data/December2017.CSV

Large diffs are not rendered by default.

3,409 changes: 3,409 additions & 0 deletions raw_data/February2016.CSV

Large diffs are not rendered by default.

3,186 changes: 3,186 additions & 0 deletions raw_data/February2018.CSV

Large diffs are not rendered by default.

3,901 changes: 3,901 additions & 0 deletions raw_data/January2016.CSV

Large diffs are not rendered by default.

3,826 changes: 3,826 additions & 0 deletions raw_data/January2018.CSV

Large diffs are not rendered by default.

4,555 changes: 4,555 additions & 0 deletions raw_data/July2016.CSV

Large diffs are not rendered by default.

4,464 changes: 4,464 additions & 0 deletions raw_data/July2017.CSV

Large diffs are not rendered by default.

4,258 changes: 4,258 additions & 0 deletions raw_data/July2018.CSV

Large diffs are not rendered by default.

4,340 changes: 4,340 additions & 0 deletions raw_data/June2016.CSV

Large diffs are not rendered by default.

4,211 changes: 4,211 additions & 0 deletions raw_data/June2017.CSV

Large diffs are not rendered by default.

4,283 changes: 4,283 additions & 0 deletions raw_data/June2018.CSV

Large diffs are not rendered by default.

4,012 changes: 4,012 additions & 0 deletions raw_data/March2016.CSV

Large diffs are not rendered by default.

3,630 changes: 3,630 additions & 0 deletions raw_data/March2018.CSV

Large diffs are not rendered by default.

4,431 changes: 4,431 additions & 0 deletions raw_data/May2016.CSV

Large diffs are not rendered by default.

4,014 changes: 4,014 additions & 0 deletions raw_data/May2018.CSV

Large diffs are not rendered by default.

3,953 changes: 3,953 additions & 0 deletions raw_data/November2016.CSV

Large diffs are not rendered by default.

3,848 changes: 3,848 additions & 0 deletions raw_data/November2017.CSV

Large diffs are not rendered by default.

4,587 changes: 4,587 additions & 0 deletions raw_data/October2016.CSV

Large diffs are not rendered by default.

4,463 changes: 4,463 additions & 0 deletions raw_data/October2017.CSV

Large diffs are not rendered by default.

4,088 changes: 4,088 additions & 0 deletions raw_data/October2018.CSV

Large diffs are not rendered by default.

4,388 changes: 4,388 additions & 0 deletions raw_data/September2016.CSV

Large diffs are not rendered by default.

3,757 changes: 3,757 additions & 0 deletions raw_data/September2017.CSV

Large diffs are not rendered by default.

4,097 changes: 4,097 additions & 0 deletions raw_data/September2018.CSV

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions scraping_stl_crimedata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
'''
This is the same code as the IPython Notebook version of 'scraping_stl_crimedata'.

Prerequisites (install these first):

requests
beautifulsoup4 (imported as bs4)

'''

import requests
from bs4 import BeautifulSoup
import re
import os
import time

# URL of the crime report page; it is requested with GET and POST below.
url = 'http://www.slmpd.org/CrimeReport.aspx'

# Directory that downloaded files are saved into.
path = 'raw_data/'

def get_filename(headers):
    """Parse the filename out of a response's Content-Disposition header.

    Args:
        headers: Mapping of response headers; it is indexed with the
            lower-case key 'content-disposition'.

    Returns:
        The value of the header's ``filename`` parameter with surrounding
        whitespace and double quotes removed. If the header has no
        ``filename`` parameter, everything after the first '=' is returned
        instead (also stripped).

    Raises:
        KeyError: If the header is missing from ``headers``.
        IndexError: If the header value contains no '=' at all.
    """
    disposition = headers['content-disposition']

    # Walk the ';'-separated parameters so that parameters following the
    # filename, and '=' characters inside the name, do not corrupt the result.
    for param in disposition.split(';'):
        key, sep, value = param.partition('=')
        if sep and key.strip().lower() == 'filename':
            return value.strip().strip('"')

    # No explicit filename parameter: take whatever follows the first '='.
    return disposition.split('=', 1)[1].strip().strip('"')

# Matches the href of every link whose __doPostBack target ends in 'D'.
# Written as a raw string so the backslashes reach the regex engine as-is
# instead of being treated as (invalid) string escape sequences.
_link_pattern = re.compile(r"javascript:__doPostBack\('.*D',''\)")


def _parse_page(html):
    """Return (inputs, links) scraped from the HTML of one listing page.

    inputs maps the name of every <input> tag to its value (each tag is
    assumed to carry both attributes); links is the list of tags whose href
    matches _link_pattern.
    """
    page = BeautifulSoup(html, "html.parser")
    inputs = {tag['name']: tag['value'] for tag in page.find_all('input')}
    links = page.find_all(href=_link_pattern)
    return inputs, links


# The first page request is a plain GET to the url.
r = requests.get(url)
payload, links = _parse_page(r.content)

# One (page number, form fields, download links) tuple per listing page.
# The first entry stores a copy because payload is modified below.
datasets_eventtargets_raw = [(1, dict(payload), links)]

# Further pages are requested by posting back to the grid, reusing the form
# fields collected from the first page.
payload['__EVENTTARGET'] = 'GridView1'

# Request pages 2 through 6.
for i in range(2, 7):
    # The event argument selects which page the post-back asks for.
    payload['__EVENTARGUMENT'] = 'Page$' + str(i)

    r = requests.post(url, data=payload)
    inputs, links = _parse_page(r.content)
    datasets_eventtargets_raw.append((i, inputs, links))

# Make sure the save location exists, then collect the names of the files
# already in it. Using `path` keeps this check pointed at the same directory
# the downloads are written to below.
os.makedirs(path, exist_ok=True)
file_list = set(os.listdir(path))

# Captures the text between "('" and the next "'" in a __doPostBack href.
pat = re.compile(r"\(\'(.+?)\'\)?")

# Each tuple is (page number, form fields for that page, download links).
for tup in datasets_eventtargets_raw:
    # (post-back target, link text) per link; the link text is compared with
    # the filename the server reports, to validate the response.
    datasets_eventtargets = [(pat.findall(x['href'])[0], x.text) for x in tup[2]]

    # Form fields shared by every file request on this page, with a blank
    # event argument added.
    payload = tup[1]
    payload['__EVENTARGUMENT'] = ''

    for t in datasets_eventtargets:
        # Skip files that were already in the directory when the run started.
        if t[1] in file_list:
            print(t[1] + ' has already been downloaded.')
            continue

        payload['__EVENTTARGET'] = t[0]
        r = requests.post(url, data=payload)

        # A response without a usable Content-Disposition header is treated
        # as a failed download rather than aborting the whole run.
        try:
            filename = get_filename(r.headers)
        except (KeyError, IndexError):
            filename = None

        if filename == t[1]:
            # Save the file.
            # TODO: Should rename the files so year is first so they sort correctly.
            with open(os.path.join(path, filename), 'wb') as f:
                f.write(r.content)
        else:
            print('Error with page: ' + str(tup[0]) + ', argument: ' + t[0])

        time.sleep(5)  # to avoid connection issues with the server