lang-main/misc/spellcheck/symspell.ipynb
Florian Förster 9edcd5be4e initial commit
2024-05-08 14:46:43 +02:00

351 lines
14 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\foersterflorian\\AppData\\Local\\Temp\\ipykernel_22040\\126296594.py:3: DeprecationWarning: \n",
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
"but was not found to be installed on your system.\n",
"If this would cause problems for you,\n",
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
" \n",
" import pandas as pd\n"
]
}
],
"source": [
"import csv\n",
"from itertools import islice\n",
"\n",
"import pandas as pd\n",
"from symspellpy import SymSpell, Verbosity"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Spell checker instance built with the library's default settings (no arguments passed).\n",
"sym_spell = SymSpell()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"DICT_PATH = './de_DE.txt'\n",
"DICT_PATH_2 = './deu_news_2023_1M-words.txt'"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(DICT_PATH_2, sep='\t', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>!</td>\n",
" <td>13183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>\\t99542\\n4\\t$\\t244\\n5\\t%\\t1310\\n6\\t&amp;\\t1970\\n7\\...</td>\n",
" <td>10108</td>\n",
" </tr>\n",
" <tr>\n",
" <th>250</th>\n",
" <td>ins</td>\n",
" <td>10012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>da</td>\n",
" <td>9868</td>\n",
" </tr>\n",
" <tr>\n",
" <th>252</th>\n",
" <td>sagt</td>\n",
" <td>9634</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>759746</th>\n",
" <td>übergeben</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>759747</th>\n",
" <td>überhaupt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>759748</th>\n",
" <td>überwältigt</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>759749</th>\n",
" <td>üppige</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>759750</th>\n",
" <td>ausweiten</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>759503 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" 1 2\n",
"0 \n",
"1 ! 13183\n",
"2 \\t99542\\n4\\t$\\t244\\n5\\t%\\t1310\\n6\\t&\\t1970\\n7\\... 10108\n",
"250 ins 10012\n",
"251 da 9868\n",
"252 sagt 9634\n",
"... ... ...\n",
"759746 übergeben 1\n",
"759747 überhaupt 1\n",
"759748 überwältigt 1\n",
"759749 üppige 1\n",
"759750 ausweiten 1\n",
"\n",
"[759503 rows x 2 columns]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the frame indexed by column 0; set_index returns a new frame, so `df` is unchanged.\n",
"df.set_index(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"ename": "UnicodeDecodeError",
"evalue": "'charmap' codec can't decode byte 0x9d in position 918: character maps to <undefined>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[37], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#sym_spell.load_dictionary(DICT_PATH, 0, 1)\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43msym_spell\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_dictionary\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDICT_PATH_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\site-packages\\symspellpy\\symspellpy.py:346\u001b[0m, in \u001b[0;36mSymSpell.load_dictionary\u001b[1;34m(self, corpus, term_index, count_index, separator, encoding)\u001b[0m\n\u001b[0;32m 344\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m 345\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(corpus, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m, encoding\u001b[38;5;241m=\u001b[39mencoding) \u001b[38;5;28;01mas\u001b[39;00m infile:\n\u001b[1;32m--> 346\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_dictionary_stream\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 347\u001b[0m \u001b[43m \u001b[49m\u001b[43minfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mterm_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcount_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseparator\u001b[49m\n\u001b[0;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\site-packages\\symspellpy\\symspellpy.py:1122\u001b[0m, in \u001b[0;36mSymSpell._load_dictionary_stream\u001b[1;34m(self, corpus_stream, term_index, count_index, separator)\u001b[0m\n\u001b[0;32m 1101\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_dictionary_stream\u001b[39m(\n\u001b[0;32m 1102\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 1103\u001b[0m corpus_stream: IO[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1106\u001b[0m separator: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 1107\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n\u001b[0;32m 1108\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Loads multiple dictionary entries from a stream of word/frequency\u001b[39;00m\n\u001b[0;32m 1109\u001b[0m \u001b[38;5;124;03m count pairs.\u001b[39;00m\n\u001b[0;32m 1110\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1120\u001b[0m \u001b[38;5;124;03m ``True`` after file object is loaded.\u001b[39;00m\n\u001b[0;32m 1121\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1122\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m corpus_stream:\n\u001b[0;32m 1123\u001b[0m parts \u001b[38;5;241m=\u001b[39m line\u001b[38;5;241m.\u001b[39mrstrip()\u001b[38;5;241m.\u001b[39msplit(separator)\n\u001b[0;32m 1124\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(parts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n",
"File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\encodings\\cp1252.py:23\u001b[0m, in \u001b[0;36mIncrementalDecoder.decode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, final\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m---> 23\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcodecs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcharmap_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43mdecoding_table\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n",
"\u001b[1;31mUnicodeDecodeError\u001b[0m: 'charmap' codec can't decode byte 0x9d in position 918: character maps to <undefined>"
]
}
],
"source": [
"#sym_spell.load_dictionary(DICT_PATH, 0, 1)\n",
"sym_spell.load_dictionary(DICT_PATH_2, 0, 1)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('der', 2061271019), ('die', 1905386032), ('und', 1438445733), ('in', 1023377139), ('den', 612197692)]\n"
]
}
],
"source": [
"print(list(islice(sym_spell.words.items(), 5)))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mitgliedschaft, 1, 827378\n"
]
}
],
"source": [
"# lookup suggestions for single-word input strings\n",
"input_term = \"Mitglidschaft\" # misspelling of \"members\"\n",
"# max edit distance per lookup\n",
"# (max_edit_distance_lookup <= max_dictionary_edit_distance)\n",
"suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, \n",
" max_edit_distance=2, transfer_casing=True)\n",
"# display suggestion term, edit distance, and term frequency\n",
"for suggestion in suggestions:\n",
" print(suggestion)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"teil ihre kontrolle der wasseraufberitungsanlagenwöchentliche sich kontrolle reinigung, 14, 0\n"
]
}
],
"source": [
"# lookup suggestions for multi-word input strings (supports compound\n",
"# splitting & merging)\n",
"input_term = (\n",
" \"Täglihce Kontolle der Wasseraufberitungsanlagen\"\n",
" \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
")\n",
"# max edit distance per lookup (per single word, not per whole input string)\n",
"suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)\n",
"# display suggestion term, edit distance, and term frequency\n",
"for suggestion in suggestions:\n",
" print(suggestion)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# How many suggestions the most recent lookup produced.\n",
"len(suggestions)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'teil ihre kontrolle der wasseraufberitungsanlagenwöchentliche sich kontrolle reinigung'"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Text of the first suggestion only (its `term` attribute).\n",
"suggestions[0].term"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}