Web-Scraping in Python
Vorheriger Abruf der »robots.txt« erlaubt es zu erkennen, ob automatische Abrufe gestattet sind.
Absatzfilter (2021-10)
main.py
import re
import urllib.request
import ssl
import subprocess
# Fetch a web page, write every <p>…</p> paragraph into a temporary
# HTML file, then hand that file to the system's default handler.
fn = '''output-file-20211018-34984298982238-tmpdml.html'''
uri = fr'''http://www.example.com/'''
request = urllib.request.Request( uri )
resource = urllib.request.urlopen( request )
cs = resource.headers.get_content_charset()
# get_content_charset() returns None when the server declares no charset;
# str.decode(None) would raise TypeError, so fall back to UTF-8.
content = resource.read().decode( cs or 'utf-8', errors="ignore" )
# Context manager guarantees the output file is closed even on error.
with open( fn, "w", errors='ignore' ) as output:
    # re.DOTALL lets '.' span newlines, so multi-line paragraphs match.
    for p in re.finditer( r'''<p.*?</p>''', content, flags=re.DOTALL ):
        print( p.group( 0 ), file=output )
# NOTE(security): shell=True runs fn through the shell. Acceptable for
# this fixed local file name, but never pass untrusted strings here.
subprocess.Popen( fn, shell=True )
Mit urllib
main.py
import re
import urllib.request
import ssl
from urllib.parse import unquote
def setup_a_direct_connection_to_the_internet():
    """Monkey-patch urllib so no proxy is ever detected or used."""
    def _no_proxies():
        # Report "no proxies configured", whatever the environment says.
        return {}
    def _always_bypass( host, proxies=None ):
        # Claim every host is on the proxy bypass list.
        return 1
    urllib.request.getproxies = _no_proxies
    urllib.request.proxy_bypass = _always_bypass
def fetch_resource( URI, headers, context ):
    """Open and return the HTTP response for *URI* using the given
    request *headers* and SSL *context*."""
    return urllib.request.urlopen(
        urllib.request.Request( URI, headers=headers ),
        context=context )
def get_resource( URI ):
    """Fetch *URI* with browser-like request headers.

    NOTE(review): uses an unverified SSL context, i.e. certificate
    checking is deliberately disabled.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'en-US,de-DE',
        'Accept-Encoding': 'UTF-8',
        # Some servers refuse requests without a Referer; reuse the URI.
        'Referer': URI,
    }
    return fetch_resource(
        URI=URI,
        headers=browser_headers,
        context=ssl._create_unverified_context() )
# Demo: disable proxies, fetch example.com, print the decoded body.
setup_a_direct_connection_to_the_internet()
uri = fr"http://example.com"
resource = get_resource( uri )
cs = resource.headers.get_content_charset()
# get_content_charset() yields None when no charset header is present;
# decode(None) raises TypeError, so default to UTF-8.
content = resource.read().decode( cs or 'utf-8' )
print( content )
Mit Sockets
main.py
import socket
# Raw-HTTP fetch of the FU Berlin weather text page; extract the value
# following the phrase "Die Temperatur betr…".
# Use a distinct name: the original rebound 'socket' and shadowed the module.
sock = socket.socket()
sock.connect( ( 'www.met.fu-berlin.de', 80 ))
sock.sendall( b'GET /wetter/mvdtext/ypage.txt HTTP/1.1\r\nHOST: www.met.fu-berlin.de\r\n\r\n' )
# NOTE(review): a single recv() may return only part of the response;
# assumed sufficient for this small text page — confirm if reused.
data = sock.recv( 32768 )
sock.close()  # release the connection once the response is read
data = data.decode( 'iso8859-1' )
# +23 skips past the search phrase itself to the numeric value.
pos = data.find( "Die Temperatur betr" )+ 23
end = pos
# Advance over digits, dots and spaces to find the end of the number.
while True:
    x = data[ end ]
    if x == '.' or x == ' ' or '0' <= x <= '9': end += 1
    else: break
print( data[ pos : end ].strip() )
- Protokoll
29.7
Downloader (simple spider)
Vorsicht! Überschreibt Dateien!
main.py
import re
import urllib.request
import ssl
from urllib.parse import unquote
def fetch_source( URI, headers, context ):
    """Return the opened HTTP response for *URI* with the given
    *headers* and SSL *context*."""
    req = urllib.request.Request( URI, headers=headers )
    return urllib.request.urlopen( req, context=context )
def get_source( URI ):
    """Fetch *URI* with a desktop-browser User-Agent.

    NOTE(review): certificate verification is deliberately disabled
    via the unverified SSL context.
    """
    return fetch_source(
        URI=URI,
        headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' },
        context=ssl._create_unverified_context() )
resource = get_source( INSERT SOURCE URI HERE )
cs = resource.headers.get_content_charset()
content = resource.read().decode( cs )
s=content
pattern = "(http[0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.!/()=?`*;:_{}\[\]\\|~%^-]*?jpg)"
pattern = 'href="/([^"]+:[^"]+)"'
pattern = INSERT URI PATTERN HERE
it = re.finditer( pattern, s )
seen = set()
uri = list()
for m in it:
u = m.groups()[ 0 ]
if not u in seen:
uri.append( INSERT URI PREFIX HERE + u )
seen.add( u )
# Translation table mapping the nine characters Windows forbids in file
# names to their full-width Unicode look-alikes, so that a URI can be
# used directly as a file name.
# NOTE(review): the original replace() chain was garbled by a lossy
# encoding — every call replaced a character with itself, and the
# backslash line was a syntax error. This restores the evident intent.
_FORBIDDEN_TO_FULLWIDTH = str.maketrans( {
    '<': '\uFF1C',   # ＜
    '>': '\uFF1E',   # ＞
    ':': '\uFF1A',   # ：
    '"': '\uFF02',   # ＂
    '/': '\uFF0F',   # ／
    '\\': '\uFF3C',  # ＼
    '|': '\uFF5C',   # ｜
    '?': '\uFF1F',   # ？
    '*': '\uFF0A',   # ＊
} )

def filename( s ):
    """Return *s* with all Windows-forbidden file-name characters
    replaced by their full-width equivalents."""
    return s.translate( _FORBIDDEN_TO_FULLWIDTH )
# Download every collected URI and save the raw response body to a
# local file named after the percent-decoded URI.
for u in uri:
    # Percent-decode for the local file name; the encoding may need to
    # match the target site (hence the original "CHANGE THIS" marker).
    v = unquote( u, encoding='utf-8' ) # CHANGE THIS
    print( v )
    try:
        resource = get_source( u )
        # Binary mode: write the response bytes unmodified.
        with open( filename( v ) + ".html", 'wb' )as f:
            f.write( resource.read() )
    except urllib.error.HTTPError as error:
        # Report HTTP failures (404 etc.) and continue with the next URI.
        # NOTE(review): the trailing "- Protokoll" is document text fused
        # onto this line by extraction, not part of the program.
        print( error )- Protokoll
"utf-8"
"Content-type"
"text/html; charset=utf-8"
"viewport"
"width=device-width, initial-scale=1"
"text/css"
"Open Sans"
"Helvetica Neue"
"http://www.iana.org/domains/example"
Etwaige extrahierte URIs aus Attributen müssen eventuell noch mit __import__('html').unescape(URI) dekodiert werden, danach ist es eventuell auch noch nötig, bestimmten Zeichen (z.B. Leerzeichen) mit % zu kodieren (%20).