Sprachuntersuchungen mit Python
grep
Alle Zeilen ausgeben, die eine bestimmte Eigenschaft (hier »>>> «) haben.
main.py
# Sample text: a transcript of an interactive session. Input lines carry
# the prompt marker '>>> ', result lines do not.
source = '''
>>> 0xb0
176
>>> 0xaf
175
>>> 0xFF
255
'''[1:][:-1]  # drop the newline after the opening and before the closing quotes

# Print every line that contains the marker (plain substring test).
for line in source.split( '\n' ):
    if '>>> ' in line:
        print( line )
# - Protokoll (transcript of the output):
>>> 0xb0
>>> 0xaf
>>> 0xFF
main.py
import re

# Sample text: a transcript of an interactive session. Input lines carry
# the prompt marker '>>> ', result lines do not.
source = '''
>>> 0xb0
176
>>> 0xaf
175
>>> 0xFF
255
'''[1:][:-1]  # drop the newline after the opening and before the closing quotes

# Print every line in which the regular expression matches somewhere
# (re.search looks for the pattern at any position in the line).
for line in source.split( '\n' ):
    if re.search( '>>> ', line ):
        print( line )
# - Protokoll (transcript of the output):
>>> 0xb0
>>> 0xaf
>>> 0xFF
main.py
import re

# Sample text: a transcript of an interactive session. Input lines carry
# the prompt marker '>>> ', result lines do not.
source = '''
>>> 0xb0
176
>>> 0xaf
175
>>> 0xFF
255
'''[1:][:-1]  # drop the newline after the opening and before the closing quotes

# Print every line that matches the pattern as a whole. re.match anchors at
# the start of the line, so the pattern spells out "anything, the marker,
# anything" explicitly.
for line in source.split( '\n' ):
    if re.match( '^.*>>> .*$', line ):
        print( line )
# - Protokoll (transcript of the output):
>>> 0xb0
>>> 0xaf
>>> 0xFF
Geteiltes markiertes Vokabular
Welche markierten Wörter eines Textes findet man auch in einem anderen Text?
main.py
import re

# The two sample texts whose vocabulary is compared.
text0 = '''
Kennt jemand einen Photographen?
'''[1:][:-1]  # drop the newline after the opening and before the closing quotes

text1 = '''
Hallo, ich suche einen Photographen!
'''[1:][:-1]

# Words to be ignored in the comparison. An empty set is written set();
# a bare `{ }` would create an empty dict instead.
stop = set()
def words( text ):
    """Return the set of distinct words found in *text*.

    A word is a maximal run of characters other than space, colon, comma,
    period, exclamation mark and question mark. Other characters, including
    line breaks, do not separate words.
    """
    # Scan the argument itself rather than a fixed sample text, so the
    # result depends only on the text that was passed in.
    return { match.group( 0 ) for match in re.finditer( r'[^ :,.!?]+', text )}
# Words that occur in both texts, with the stop words removed.
shared = words( text0 ).intersection( words( text1 ))
print( shared.difference( stop ))
Häufigkeitszählung
Welche Kurse wurden wie oft angekündigt? Gegeben: Eine Liste der Kurse mit einer Ankündigung pro Zeile.
main.py
from collections import Counter
import operator

# The announcements, one course per line.
lines = '''
SQL 1
Python 1
SQL 1
Java 1
Python 1
Java 1
SQL 1
C++ 1
Python 1
Java 1
Java 1
Python 1
Java 1
SQL 1
C 1
VBA 1
Python 1
JavaScript 1
Java 2
Python 1
JavaScript 1
Python 1
SQL 1
Python 1
Python 1
Python 1
Python 1
Java 1
Java 2
'''[1:][:-1]  # drop the newline after the opening and before the closing quotes

# Count how often each distinct line occurs; Counter tallies the items of
# the iterable it is given.
Kurs = Counter( lines.split( '\n' ))

# List the courses by descending count. sorted() is stable, so courses with
# equal counts keep the order in which they first appeared.
for( course, count )in sorted( Kurs.items(), key=operator.itemgetter( 1 ), reverse=True ):
    print( f"{count:2d} - {course}" )
# - Protokoll (transcript of the output):
11 - Python 1
 6 - Java 1
 5 - SQL 1
 2 - JavaScript 1
 2 - Java 2
 1 - C++ 1
 1 - C 1
 1 - VBA 1
main.py
import re
import math
import operator

class scanner_class:
    """Character scanner that keeps a read position within a source string."""

    def __init__( self, source ):
        """Start scanning *source* at its first character."""
        self.source = source
        self.position = 0

    def check( self, set ):
        """Consume and return the next character if it is contained in *set*.

        Returns None, without advancing, at the end of the source or when
        the next character is not in *set*.
        """
        if not self.position < len( self.source ): return None
        symbol = self.source[ self.position ]
        if symbol in set:
            self.position += 1
            return symbol
        return None

    def numeral( self ):
        """Consume the longest run of decimal digits at the current position.

        Returns the digits as a string; the string is empty when the next
        character is not a digit or the source is exhausted.
        """
        start = self.position
        while self.position < len( self.source ) and self.source[ self.position ] in "0123456789":
            self.position += 1
        return self.source[ start: self.position ]

# Maps each operator symbol to the function that evaluates it.
ex = { '^' : operator.pow, '*': operator.mul, '/': operator.truediv,
       '+': operator.add, '-': operator.sub }

# Truthy for operators that associate to the left, falsy (only '^') for
# those that associate to the right.
left_associative = { '^' : 0, '*': 1, '/': 1, '+': 1, '-': 1 }
class parser_class:
    """Recursive-descent evaluator for expressions built from unsigned
    integer numerals and the binary operators ^, *, /, + and -."""

    def __init__( self, source ):
        """Create a scanner over the expression text *source*."""
        self.scanner = scanner_class( source )

    def numeral( self ):
        """Return the digit string at the current scanner position."""
        return self.scanner.numeral()

    def primary( self ):
        """Parse a numeral and return its value as a float.

        Raises ValueError when no numeral is found at the current position.
        """
        text = self.numeral()
        if not text:
            # float( '' ) would raise ValueError as well, but without
            # saying where in the expression the operand is missing.
            raise ValueError( f"numeral expected at position {self.scanner.position} of {self.scanner.source!r}" )
        return float( text )

    def binop( self, op, next ):
        """Parse and evaluate a chain of operands joined by operators from *op*.

        *op* is a string of operator characters; *next* is the method that
        parses one operand. For an operator marked left-associative the
        running result is combined with the next operand right away. For a
        right-associative operator the right-hand side is the whole rest of
        the chain, obtained by a recursive call.
        """
        result = next()
        while sym := self.scanner.check( op ):
            result = ex[ sym ]( result, ( next() if left_associative[ sym ] else self.binop( op, next )))
        return result

    # One method per precedence level; each level takes its operands from
    # the level named as the second argument.
    def power( self ): return self.binop( "^", self.primary )
    def product( self ): return self.binop( "*/", self.power )
    def sum( self ): return self.binop( "+-", self.product )
    def start( self ): return self.sum()

def evl( expr ):
    """Evaluate the expression text *expr* and return the result."""
    return parser_class( expr ).start()

def check( expr, value ):
    """Evaluate *expr* and print the result, the expected value (converted
    to float) and whether the two are equal."""
    v = evl( expr )
    w = float( value )
    print( v, w, v == w )

check( "0", "0" )
# Further self-checks; each pair holds an expression and its expected value.
for expression, expected in (
    ( "1", "1" ),
    ( "11", "11" ),
    ( "1+1", "2" ),
    ( "11+1", "12" ),
    ( "3-2", "1" ),
    ( "3*2", "6" ),
    ( "2^3", "8" ),
    ( "2^3-1", "7" ),
    ( "2^3/2", "4" ),
    ( "2^3^2", "512" ),
    ( "6+3*2^3+1", "31" ),
):
    check( expression, expected )
- transcript
0.0 0.0 True
1.0 1.0 True
11.0 11.0 True
2.0 2.0 True
12.0 12.0 True
1.0 1.0 True
6.0 6.0 True
8.0 8.0 True
7.0 7.0 True
4.0 4.0 True
512.0 512.0 True
31.0 31.0 True
Treffer mit Umgebung (keywords in context, KWIC)
main.py
import pathlib
import re
def process( file ):
    """Print every occurrence of the search phrase in *file* with its context.

    *file* is a path object providing ``open``; the file is read with the
    cp1252 encoding. Each hit is first cut out together with 100 characters
    on either side, cleaned up (line breaks and ``<br>`` become spaces, runs
    of spaces are collapsed to one), and then printed together with 20
    characters on either side. Hits that have fewer than 100 characters
    before or after them in the file are not reported.
    """
    phrase = r"[Tt]hank you [a-z]+ much"
    flags = re.DOTALL | re.IGNORECASE
    # Compile both context patterns once instead of on every search.
    wide = re.compile( r".{100}" + phrase + r".{100}", flags )
    narrow = re.compile( r".{20}" + phrase + r".{20}", flags )
    # Use a separate name for the open stream so the parameter is not rebound.
    with file.open( encoding="cp1252" ) as stream:
        content = stream.read()
    for hit in wide.finditer( content ):
        text = hit.group( 0 ).replace( "\n", " " ).replace( "<br>", " " )
        # Collapse runs of spaces so that the 20-character context counts
        # visible text instead of padding left behind by the replacements.
        text = re.sub( r" {2,}", " ", text )
        for context in narrow.finditer( text ):
            print( context.group( 0 ))
# Search every listed file in turn.
files = [ pathlib.Path( r'example.txt' )]
for path in files:
    process( path )