import socket
import ssl
import urllib
import re
import os
import sys

# NOTE(review): this file arrived collapsed onto a single physical line, and a
# span of text in the middle is missing (see the marker inside download()).
# Line breaks and indentation below were restored from the syntax alone; the
# nesting of everything after the gap is an assumption -- TODO confirm against
# an intact copy of the file.

# True when the interpreter's executable path contains "jython".
# Or simply set this to True in TigerJython.
tigerJython = (sys.executable.find("jython")>=0)


# Download a Wikipedia page from de.wikipedia.org over HTTPS using a raw
# socket and a hand-written HTTP/1.1 GET request.
# url is the request path, e.g. /wiki/Burg_Liechtenstein
# Returns "" if the first recv() times out. The normal return path is in the
# part of the function that is missing from this copy.
def download(url):
    host = "de.wikipedia.org"
    port = 443
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Timeout of 0.5 seconds on the socket.
    s.settimeout(0.5)
    s.connect((host , port))
    # NOTE(review): ssl.wrap_socket() is deprecated since Python 3.7 and was
    # removed in Python 3.12; it also appears to do no certificate or host
    # name verification by default. Confirm the target interpreter and
    # consider ssl.create_default_context().wrap_socket(...).
    s = ssl.wrap_socket(s)
    # Minimal request: request line plus the Host header only.
    request = "GET " + url + " HTTP/1.1\r\nHost: " + host + "\r\n\r\n"
    if (tigerJython):
        # Under TigerJython the request string is sent as-is.
        s.sendall(request)
    else:
        # Otherwise the request is encoded to UTF-8 bytes first.
        s.sendall(bytes(request, 'utf-8'))
    try:
        reply = s.recv(4096)
    except socket.timeout:
        # No data within the timeout: give up on this page.
        s.close()
        return ""
    # Look for the Content-Length header, trying two capitalizations.
    # NOTE(review): reply is decoded after every recv(); a chunk that ends in
    # the middle of a multi-byte UTF-8 sequence looks like it would raise
    # UnicodeDecodeError here -- TODO confirm, searching the bytes directly
    # would avoid it.
    start = reply.decode('utf-8').find("Content-Length: ");
    if (start==-1):
        start = reply.decode('utf-8').find("content-length: ");
    if (start>=0):
        # The header value runs up to the end of the header line.
        end = reply.decode('utf-8').find("\r\n",start)
        #while reply[end]>='0' and reply[end]<='9':
        #    end+=1
        # print("Length: ->"+reply[(start+16):end]+"<-")
        # 16 is the length of "Content-Length: ". The indices come from the
        # decoded text but are applied to the undecoded reply.
        numbytes = int(reply[(start+16):end])
        print("About to get %d bytes" % numbytes);
        # Keep reading until the blank line that ends the headers has arrived.
        while reply.decode('utf-8').find("\r\n\r\n")<0:
            # print("About to get next chunk...")
            more = s.recv(4096)
            # print("Got %d more bytes" % len(more))
            reply += more
        # Drop the headers; reply now starts at the body.
        reply = reply[(reply.decode('utf-8').find("\r\n\r\n")+4):]
        # print("Reply so far: "+reply)
        # NOTE(review): SOURCE IS DAMAGED ON THE NEXT LINE. The text between
        # "while len(reply)" and ".*?', \"\", html)" is missing. The lost span
        # contains the rest of download() (no close() on the success path is
        # visible) and the beginning of the main script, including whatever
        # defines words, todo and html. The fragment is kept verbatim; it is
        # not valid Python. Restore it from an intact copy.
        while len(reply).*?', "", html)

# NOTE(review): the statements below belong to the main script. They are shown
# at top level because the enclosing structure is in the missing text; they
# may really sit inside a loop over pages -- TODO confirm.

# Remove everything between '<' and '>' from the text.
text = re.sub('<.*?>', "", text)
# Upper-case each whitespace-separated token and keep only those made up of
# 4 to 10 letters A-Z.
ww = [w.upper() for w in text.split() if re.match('^[A-Z]{4,10}$',w.upper())]
# print("Have %d words" % len(ww))
# Add the tokens to the words counter, tallying how many were not seen before.
newwords = 0
for w in ww:
    if w in words:
        words[w]+=1
    else:
        newwords+=1
        words[w] = 1
print("Have %d new words" % newwords)
# Show the first ten entries of todo.
print("Open pages:")
print(todo[0:10])

alle = words.keys()
# print("\n".join(alle))
# Write one file per word length 4..10 (wortliste04.txt .. wortliste10.txt),
# sorted, one "WORD count" pair per line.
for l in range(4,11):
    wl = [w for w in alle if len(w)==l]
    wl.sort()
    # print("\n".join(wl))
    with open("wortliste%02d.txt" % l, "w") as file:
        for w in wl:
            file.write("%s %d\n" % (w, words[w]))