351 lines
14 KiB
Plaintext
351 lines
14 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\foersterflorian\\AppData\\Local\\Temp\\ipykernel_22040\\126296594.py:3: DeprecationWarning: \n",
|
||
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
|
||
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
|
||
"but was not found to be installed on your system.\n",
|
||
"If this would cause problems for you,\n",
|
||
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
|
||
" \n",
|
||
" import pandas as pd\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from symspellpy import SymSpell, Verbosity\n",
|
||
"from itertools import islice\n",
|
||
"import pandas as pd"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"sym_spell = SymSpell()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"DICT_PATH = './de_DE.txt'\n",
|
||
"DICT_PATH_2 = './deu_news_2023_1M-words.txt'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = pd.read_csv(DICT_PATH_2, sep='\t', header=None)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>1</th>\n",
|
||
" <th>2</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>!</td>\n",
|
||
" <td>13183</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>\\t99542\\n4\\t$\\t244\\n5\\t%\\t1310\\n6\\t&\\t1970\\n7\\...</td>\n",
|
||
" <td>10108</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>250</th>\n",
|
||
" <td>ins</td>\n",
|
||
" <td>10012</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>251</th>\n",
|
||
" <td>da</td>\n",
|
||
" <td>9868</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>252</th>\n",
|
||
" <td>sagt</td>\n",
|
||
" <td>9634</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>759746</th>\n",
|
||
" <td>übergeben</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>759747</th>\n",
|
||
" <td>überhaupt</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>759748</th>\n",
|
||
" <td>überwältigt</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>759749</th>\n",
|
||
" <td>üppige</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>759750</th>\n",
|
||
" <td>ausweiten</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>759503 rows × 2 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" 1 2\n",
|
||
"0 \n",
|
||
"1 ! 13183\n",
|
||
"2 \\t99542\\n4\\t$\\t244\\n5\\t%\\t1310\\n6\\t&\\t1970\\n7\\... 10108\n",
|
||
"250 ins 10012\n",
|
||
"251 da 9868\n",
|
||
"252 sagt 9634\n",
|
||
"... ... ...\n",
|
||
"759746 übergeben 1\n",
|
||
"759747 überhaupt 1\n",
|
||
"759748 überwältigt 1\n",
|
||
"759749 üppige 1\n",
|
||
"759750 ausweiten 1\n",
|
||
"\n",
|
||
"[759503 rows x 2 columns]"
|
||
]
|
||
},
|
||
"execution_count": 48,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.set_index(0)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "UnicodeDecodeError",
|
||
"evalue": "'charmap' codec can't decode byte 0x9d in position 918: character maps to <undefined>",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[1;32mIn[37], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#sym_spell.load_dictionary(DICT_PATH, 0, 1)\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43msym_spell\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_dictionary\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDICT_PATH_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\site-packages\\symspellpy\\symspellpy.py:346\u001b[0m, in \u001b[0;36mSymSpell.load_dictionary\u001b[1;34m(self, corpus, term_index, count_index, separator, encoding)\u001b[0m\n\u001b[0;32m 344\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 345\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(corpus, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m, encoding\u001b[38;5;241m=\u001b[39mencoding) \u001b[38;5;28;01mas\u001b[39;00m infile:\n\u001b[1;32m--> 346\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_dictionary_stream\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 347\u001b[0m \u001b[43m \u001b[49m\u001b[43minfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mterm_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcount_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseparator\u001b[49m\n\u001b[0;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\site-packages\\symspellpy\\symspellpy.py:1122\u001b[0m, in \u001b[0;36mSymSpell._load_dictionary_stream\u001b[1;34m(self, corpus_stream, term_index, count_index, separator)\u001b[0m\n\u001b[0;32m 1101\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_dictionary_stream\u001b[39m(\n\u001b[0;32m 1102\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 1103\u001b[0m corpus_stream: IO[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1106\u001b[0m separator: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 1107\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n\u001b[0;32m 1108\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Loads multiple dictionary entries from a stream of word/frequency\u001b[39;00m\n\u001b[0;32m 1109\u001b[0m \u001b[38;5;124;03m count pairs.\u001b[39;00m\n\u001b[0;32m 1110\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1120\u001b[0m \u001b[38;5;124;03m ``True`` after file object is loaded.\u001b[39;00m\n\u001b[0;32m 1121\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1122\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m corpus_stream:\n\u001b[0;32m 1123\u001b[0m parts \u001b[38;5;241m=\u001b[39m line\u001b[38;5;241m.\u001b[39mrstrip()\u001b[38;5;241m.\u001b[39msplit(separator)\n\u001b[0;32m 1124\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(parts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n",
|
||
"File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\encodings\\cp1252.py:23\u001b[0m, in \u001b[0;36mIncrementalDecoder.decode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, final\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m---> 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcodecs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcharmap_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43mdecoding_table\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n",
|
||
"\u001b[1;31mUnicodeDecodeError\u001b[0m: 'charmap' codec can't decode byte 0x9d in position 918: character maps to <undefined>"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"#sym_spell.load_dictionary(DICT_PATH, 0, 1)\n",
|
||
"sym_spell.load_dictionary(DICT_PATH_2, 0, 1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[('der', 2061271019), ('die', 1905386032), ('und', 1438445733), ('in', 1023377139), ('den', 612197692)]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(list(islice(sym_spell.words.items(), 5)))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Mitgliedschaft, 1, 827378\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# lookup suggestions for single-word input strings\n",
|
||
"input_term = \"Mitglidschaft\" # misspelling of \"members\"\n",
|
||
"# max edit distance per lookup\n",
|
||
"# (max_edit_distance_lookup <= max_dictionary_edit_distance)\n",
|
||
"suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, \n",
|
||
" max_edit_distance=2, transfer_casing=True)\n",
|
||
"# display suggestion term, edit distance, and term frequency\n",
|
||
"for suggestion in suggestions:\n",
|
||
" print(suggestion)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"teil ihre kontrolle der wasseraufberitungsanlagenwöchentliche sich kontrolle reinigung, 14, 0\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# lookup suggestions for multi-word input strings (supports compound\n",
|
||
"# splitting & merging)\n",
|
||
"input_term = (\n",
|
||
" \"Täglihce Kontolle der Wasseraufberitungsanlagen\"\n",
|
||
" \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
|
||
")\n",
|
||
"# max edit distance per lookup (per single word, not per whole input string)\n",
|
||
"suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)\n",
|
||
"# display suggestion term, edit distance, and term frequency\n",
|
||
"for suggestion in suggestions:\n",
|
||
" print(suggestion)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"1"
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(suggestions)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'teil ihre kontrolle der wasseraufberitungsanlagenwöchentliche sich kontrolle reinigung'"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"suggestions[0].term"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|