lang-main/misc/spellcheck/symspell.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\foersterflorian\\AppData\\Local\\Temp\\ipykernel_22040\\126296594.py:3: DeprecationWarning: \n",
      "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
      "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
      "but was not found to be installed on your system.\n",
      "If this would cause problems for you,\n",
      "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
      "        \n",
      "  import pandas as pd\n"
     ]
    }
   ],
   "source": [
    "from symspellpy import SymSpell, Verbosity\n",
    "from itertools import islice\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "sym_spell = SymSpell()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "DICT_PATH = './de_DE.txt'\n",
    "DICT_PATH_2 = './deu_news_2023_1M-words.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(DICT_PATH_2, sep='\t', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>!</td>\n",
       "      <td>13183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\t99542\\n4\\t$\\t244\\n5\\t%\\t1310\\n6\\t&amp;\\t1970\\n7\\...</td>\n",
       "      <td>10108</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>250</th>\n",
       "      <td>ins</td>\n",
       "      <td>10012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>251</th>\n",
       "      <td>da</td>\n",
       "      <td>9868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>252</th>\n",
       "      <td>sagt</td>\n",
       "      <td>9634</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759746</th>\n",
       "      <td>übergeben</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759747</th>\n",
       "      <td>überhaupt</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759748</th>\n",
       "      <td>überwältigt</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759749</th>\n",
       "      <td>üppige</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759750</th>\n",
       "      <td>ausweiten</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>759503 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                        1      2\n",
       "0                                                               \n",
       "1                                                       !  13183\n",
       "2       \\t99542\\n4\\t$\\t244\\n5\\t%\\t1310\\n6\\t&\\t1970\\n7\\...  10108\n",
       "250                                                   ins  10012\n",
       "251                                                    da   9868\n",
       "252                                                  sagt   9634\n",
       "...                                                   ...    ...\n",
       "759746                                         übergeben      1\n",
       "759747                                         überhaupt      1\n",
       "759748                                       überwältigt      1\n",
       "759749                                            üppige      1\n",
       "759750                                        ausweiten      1\n",
       "\n",
       "[759503 rows x 2 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.set_index(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "ename": "UnicodeDecodeError",
     "evalue": "'charmap' codec can't decode byte 0x9d in position 918: character maps to <undefined>",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mUnicodeDecodeError\u001b[0m                        Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[37], line 2\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;66;03m#sym_spell.load_dictionary(DICT_PATH, 0, 1)\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43msym_spell\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_dictionary\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDICT_PATH_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\site-packages\\symspellpy\\symspellpy.py:346\u001b[0m, in \u001b[0;36mSymSpell.load_dictionary\u001b[1;34m(self, corpus, term_index, count_index, separator, encoding)\u001b[0m\n\u001b[0;32m    344\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m    345\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(corpus, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m, encoding\u001b[38;5;241m=\u001b[39mencoding) \u001b[38;5;28;01mas\u001b[39;00m infile:\n\u001b[1;32m--> 346\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_dictionary_stream\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    347\u001b[0m \u001b[43m        \u001b[49m\u001b[43minfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mterm_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcount_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseparator\u001b[49m\n\u001b[0;32m    348\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\site-packages\\symspellpy\\symspellpy.py:1122\u001b[0m, in \u001b[0;36mSymSpell._load_dictionary_stream\u001b[1;34m(self, corpus_stream, term_index, count_index, separator)\u001b[0m\n\u001b[0;32m   1101\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_dictionary_stream\u001b[39m(\n\u001b[0;32m   1102\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m   1103\u001b[0m     corpus_stream: IO[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1106\u001b[0m     separator: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m   1107\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n\u001b[0;32m   1108\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Loads multiple dictionary entries from a stream of word/frequency\u001b[39;00m\n\u001b[0;32m   1109\u001b[0m \u001b[38;5;124;03m    count pairs.\u001b[39;00m\n\u001b[0;32m   1110\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1120\u001b[0m \u001b[38;5;124;03m        ``True`` after file object is loaded.\u001b[39;00m\n\u001b[0;32m   1121\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1122\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m corpus_stream:\n\u001b[0;32m   1123\u001b[0m         parts \u001b[38;5;241m=\u001b[39m line\u001b[38;5;241m.\u001b[39mrstrip()\u001b[38;5;241m.\u001b[39msplit(separator)\n\u001b[0;32m   1124\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(parts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n",
      "File \u001b[1;32mc:\\Users\\foersterflorian\\mambaforge\\envs\\spellcheck\\lib\\encodings\\cp1252.py:23\u001b[0m, in \u001b[0;36mIncrementalDecoder.decode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m     22\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, final\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m---> 23\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcodecs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcharmap_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43mdecoding_table\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n",
      "\u001b[1;31mUnicodeDecodeError\u001b[0m: 'charmap' codec can't decode byte 0x9d in position 918: character maps to <undefined>"
     ]
    }
   ],
   "source": [
    "#sym_spell.load_dictionary(DICT_PATH, 0, 1)\n",
    "sym_spell.load_dictionary(DICT_PATH_2, 0, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('der', 2061271019), ('die', 1905386032), ('und', 1438445733), ('in', 1023377139), ('den', 612197692)]\n"
     ]
    }
   ],
   "source": [
    "print(list(islice(sym_spell.words.items(), 5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mitgliedschaft, 1, 827378\n"
     ]
    }
   ],
   "source": [
    "# lookup suggestions for single-word input strings\n",
    "input_term = \"Mitglidschaft\"  # misspelling of \"members\"\n",
    "# max edit distance per lookup\n",
    "# (max_edit_distance_lookup <= max_dictionary_edit_distance)\n",
    "suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, \n",
    "                               max_edit_distance=2, transfer_casing=True)\n",
    "# display suggestion term, edit distance, and term frequency\n",
    "for suggestion in suggestions:\n",
    "    print(suggestion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "teil ihre kontrolle der wasseraufberitungsanlagenwöchentliche sich kontrolle reinigung, 14, 0\n"
     ]
    }
   ],
   "source": [
    "# lookup suggestions for multi-word input strings (supports compound\n",
    "# splitting & merging)\n",
    "input_term = (\n",
    "    \"Täglihce Kontolle der Wasseraufberitungsanlagen\"\n",
    "    \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
    ")\n",
    "# max edit distance per lookup (per single word, not per whole input string)\n",
    "suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)\n",
    "# display suggestion term, edit distance, and term frequency\n",
    "for suggestion in suggestions:\n",
    "    print(suggestion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(suggestions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'teil ihre kontrolle der wasseraufberitungsanlagenwöchentliche sich kontrolle reinigung'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "suggestions[0].term"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}