Sprachuntersuchungen mit Python

grep

Alle Zeilen ausgeben, die eine bestimmte Eigenschaft (hier »>>> «) haben.

main.py

# Sample interpreter session. The slice removes the newline right after the
# opening quotes and the one right before the closing quotes.
source = '''
>>> 0xb0
176
>>> 0xaf
175
>>> 0xFF
255
'''[1:-1]

# Plain substring test: skip every row that lacks the prompt marker.
for row in source.split('\n'):
    if '>>> ' not in row:
        continue
    print(row)

Protokoll

>>> 0xb0
>>> 0xaf
>>> 0xFF

main.py

import re
# Sample interpreter session; the slice trims the first and the last newline.
source = '''
>>> 0xb0
176
>>> 0xaf
175
>>> 0xFF
255
'''[1:-1]


def _has_prompt(row):
    """Return a match object if the prompt marker occurs anywhere in *row*."""
    return re.search('>>> ', row)


# Same filter as a substring test, expressed with a regular-expression search.
for row in filter(_has_prompt, source.split('\n')):
    print(row)

Protokoll

>>> 0xb0
>>> 0xaf
>>> 0xFF

main.py

import re
# Sample interpreter session; the slice trims the first and the last newline.
source = '''
>>> 0xb0
176
>>> 0xaf
175
>>> 0xFF
255
'''[1:-1]

# re.match anchors at the start of the row, so the pattern spells out the
# arbitrary text allowed before and after the prompt marker.
selected = [row for row in source.split('\n') if re.match('^.*>>> .*$', row)]
for row in selected:
    print(row)

Protokoll

>>> 0xb0
>>> 0xaf
>>> 0xFF

Geteiltes markiertes Vokabular

Welche markierten Wörter eines Textes findet man auch in einem anderen Text?

main.py

import re
# The two texts to compare; the slice trims the first and the last newline of
# each literal.
text0 = '''
Kennt jemand einen Photographen?
'''[1:-1]
text1 = '''
Hallo, ich suche einen Photographen!
'''[1:-1]

# Words to be excluded from the result.  set() rather than {}: empty braces
# create a dict, not a set.
stop = set()

def words( text ):
    """Return the set of distinct words occurring in *text*.

    A word is a maximal run of characters other than space, colon, comma,
    full stop, exclamation mark and question mark.
    """
    # Scan the argument itself.  Scanning the module-level text0 here would
    # make every call return the same set, whatever text was passed in.
    return set( re.findall( r'[^ :,.!?]+', text ))

# Intersect the word sets of both texts, then drop whatever is listed in stop.
shared = words(text0).intersection(words(text1))
print(shared.difference(stop))

Häufigkeitszählung

Welche Kurse wurden wie oft angekündigt? Gegeben: Eine Liste der Kurse mit einer Ankündigung pro Zeile.

main.py

from collections import Counter
import operator
# One course announcement per line; the slice trims the first and the last
# newline of the literal.
lines = '''
SQL 1
Python 1
SQL 1
Java 1
Python 1
Java 1
SQL 1
C++ 1
Python 1
Java 1
Java 1
Python 1
Java 1
SQL 1
C 1
VBA 1
Python 1
JavaScript 1
Java 2
Python 1
JavaScript 1
Python 1
SQL 1
Python 1
Python 1
Python 1
Python 1
Java 1
Java 2
'''[1:-1]

# Tally identical announcement lines in one step.
Kurs = Counter(lines.split('\n'))

# most_common() yields (line, count) pairs by descending count; entries with
# equal counts keep the order in which they were first seen.
for i, j in Kurs.most_common():
    print( f"{j:2d} - {i}" )

Protokoll

11 - Python 1
 6 - Java 1
 5 - SQL 1
 2 - JavaScript 1
 2 - Java 2
 1 - C++ 1
 1 - C 1
 1 - VBA 1

main.py

import re
import math
import operator
class scanner_class:
    """Cursor over a source string that hands out one character or one digit run at a time."""

    def __init__(self, source):
        self.position = 0  # index of the next unread character
        self.source = source

    def check(self, set):
        """Consume and return the next character if it occurs in *set*.

        Returns None, leaving the position untouched, when the source is
        exhausted or the next character is not in *set*.
        """
        if self.position >= len(self.source):
            return None
        current = self.source[self.position]
        if current not in set:
            return None
        self.position += 1
        return current

    def numeral(self):
        """Consume the run of decimal digits at the current position and return it.

        The result is the empty string when no digit is next.
        """
        begin = end = self.position
        limit = len(self.source)
        while end < limit and self.source[end] in "0123456789":
            end += 1
        self.position = end
        return self.source[begin:end]
# Operator symbol -> function that combines the two operand values.
ex = {
    '^': operator.pow,
    '*': operator.mul,
    '/': operator.truediv,
    '+': operator.add,
    '-': operator.sub,
}
# Operator symbol -> 1 when the operator groups from the left, 0 when it
# groups from the right.
left_associative = {
    '^': 0,
    '*': 1,
    '/': 1,
    '+': 1,
    '-': 1,
}
class parser_class:
    """Recursive-descent evaluator for numerals joined by ^, *, /, + and -.

    Levels, loosest binding first: sum (+ -), product (* /), power (^),
    primary (a numeral converted to float).
    """

    def __init__(self, source):
        self.scanner = scanner_class(source)

    def numeral(self):
        """Return the digit run at the scanner position."""
        return self.scanner.numeral()

    def primary(self):
        """Read a numeral and return its value as a float."""
        digits = self.numeral()
        return float(digits)

    def binop(self, op, next):
        """Evaluate a chain of operands separated by operator characters from *op*.

        *next* is the parsing method for one operand, i.e. the next tighter
        level.  A left-associative operator folds one operand at a time into
        the running result; any other operator takes the whole remaining
        chain of this level as its right operand.
        """
        result = next()
        while True:
            sym = self.scanner.check(op)
            if not sym:
                break
            apply = ex[sym]
            if left_associative[sym]:
                operand = next()
            else:
                operand = self.binop(op, next)
            result = apply(result, operand)
        return result

    def power(self):
        return self.binop("^", self.primary)

    def product(self):
        return self.binop("*/", self.power)

    def sum(self):
        return self.binop("+-", self.product)

    def start(self):
        """Entry point: evaluate the source as a sum."""
        return self.sum()
def evl(expr):
    """Evaluate the expression string *expr* with a fresh parser_class and return the result."""
    parser = parser_class(expr)
    return parser.start()
def check(expr, value):
    """Print the evaluated *expr*, the float of *value*, and whether the two are equal."""
    actual = evl(expr)
    expected = float(value)
    print(actual, expected, actual == expected)
# Each pair is (expression, expected value); the pairs are checked in order.
for expression, expected in (
    ("0", "0"),
    ("1", "1"),
    ("11", "11"),
    ("1+1", "2"),
    ("11+1", "12"),
    ("3-2", "1"),
    ("3*2", "6"),
    ("2^3", "8"),
    ("2^3-1", "7"),
    ("2^3/2", "4"),
    ("2^3^2", "512"),
    ("6+3*2^3+1", "31"),
):
    check(expression, expected)

Protokoll

0.0 0.0 True
1.0 1.0 True
11.0 11.0 True
2.0 2.0 True
12.0 12.0 True
1.0 1.0 True
6.0 6.0 True
8.0 8.0 True
7.0 7.0 True
4.0 4.0 True
512.0 512.0 True
31.0 31.0 True

Treffer mit Umgebung (keyword in context, KWIC)

main.py

import pathlib
import re
def process( file, *, window=100, context=20 ):
    """Print every "thank you ... much" hit in *file* together with its surroundings.

    The file is read as cp1252 text.  Each hit is first cut out with up to
    *window* characters on either side.  In that cut, newlines and "<br>"
    become spaces and pairs of spaces are collapsed in two passes.  From the
    cleaned cut, the hit is printed with up to *context* characters on either
    side, one hit per output line.

    Args:
        file: object offering a pathlib.Path-style ``open(encoding=...)``.
        window: maximum number of raw characters kept on each side of a hit
            before cleaning.
        context: maximum number of cleaned characters printed on each side
            of a hit.
    """
    string = r"[Tt]hank you [a-z]+ much"
    flags = re.DOTALL | re.IGNORECASE
    # "{0,n}" rather than an exact "{n}": a hit closer than n characters to
    # the start or end of the text is reported with shortened context instead
    # of being dropped.
    wide = re.compile( r".{0,%d}%s.{0,%d}" % ( window, string, window ), flags )
    narrow = re.compile( r".{0,%d}%s.{0,%d}" % ( context, string, context ), flags )
    # Separate name for the handle so that the parameter is not rebound.
    with file.open( encoding="cp1252" ) as handle:
        content = handle.read()
    for region in wide.finditer( content ):
        text = region.group( 0 ).replace( "\n", " " ).replace( "<br>", " " ).replace( "  ", " " ).replace( "  ", " " )
        for hit in narrow.finditer( text ):
            print( hit.group( 0 ))
# Files to search; each one is processed in list order.
files = [pathlib.Path(r'example.txt')]
for path in files:
    process(path)