{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# German spell correction with SymSpell\n",
    "\n",
    "Loads a German word-frequency list into `symspellpy` and tries single-word and compound corrections.\n",
    "\n",
    "`deu_news_2023_1M-words.txt` contains tab-separated `id`, `word`, `count` columns and is read as UTF-8."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "from itertools import islice\n",
    "\n",
    "import pandas as pd\n",
    "from symspellpy import SymSpell, Verbosity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dictionary files, relative to the notebook's working directory.\n",
    "DICT_PATH = './de_DE.txt'\n",
    "DICT_PATH_2 = './deu_news_2023_1M-words.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect the frequency list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The file is plain tab-separated text, so quote characters must be read as\n",
    "# ordinary tokens (QUOTE_NONE). With default quoting a lone '\"' entry opens a\n",
    "# quoted field and swallows the following lines into a single cell.\n",
    "# keep_default_na=False keeps words such as \"null\" or \"NA\" as strings.\n",
    "df = pd.read_csv(\n",
    "    DICT_PATH_2,\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    "    quoting=csv.QUOTE_NONE,\n",
    "    keep_default_na=False,\n",
    "    encoding='utf-8',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column 0 holds the running id; show only the first rows.\n",
    "df.set_index(0).head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the dictionary into SymSpell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sym_spell = SymSpell()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative dictionary (presumably plain \"term count\" lines - confirm):\n",
    "# sym_spell.load_dictionary(DICT_PATH, 0, 1, encoding='utf-8')\n",
    "\n",
    "# Columns are id / term / count, separated by tabs. The encoding is passed\n",
    "# explicitly because the file is otherwise opened with the platform default\n",
    "# (cp1252 on Windows), which fails with a UnicodeDecodeError on this file.\n",
    "loaded = sym_spell.load_dictionary(\n",
    "    DICT_PATH_2, term_index=1, count_index=2, separator='\\t', encoding='utf-8'\n",
    ")\n",
    "# load_dictionary reports a missing file by returning False, not by raising.\n",
    "assert loaded, f'dictionary file not found: {DICT_PATH_2}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First few (term, count) entries as a sanity check of the loaded dictionary.\n",
    "list(islice(sym_spell.words.items(), 5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Single-word lookup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# lookup suggestions for single-word input strings\n",
    "input_term = \"Mitglidschaft\"  # misspelling of \"Mitgliedschaft\" (membership)\n",
    "# max edit distance per lookup\n",
    "# (max_edit_distance_lookup <= max_dictionary_edit_distance)\n",
    "suggestions = sym_spell.lookup(\n",
    "    input_term, Verbosity.CLOSEST, max_edit_distance=2, transfer_casing=True\n",
    ")\n",
    "# display suggestion term, edit distance, and term frequency\n",
    "for suggestion in suggestions:\n",
    "    print(suggestion)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compound lookup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# lookup suggestions for multi-word input strings (supports compound\n",
    "# splitting & merging)\n",
    "# Adjacent string literals are concatenated as-is, so the first part needs a\n",
    "# trailing space to keep the two phrases from fusing into one token.\n",
    "input_term = (\n",
    "    \"Täglihce Kontolle der Wasseraufberitungsanlagen \"\n",
    "    \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
    ")\n",
    "# max edit distance per lookup (per single word, not per whole input string)\n",
    "suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)\n",
    "# display suggestion term, edit distance, and term frequency\n",
    "for suggestion in suggestions:\n",
    "    print(suggestion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(suggestions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "suggestions[0].term"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}