A collection of various scripts and other utilities useful in Biblical studies.

load-sblgnt.py 1.3KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. # load-sblgnt.py
  2. # Reads the SBLGNT from plain-text files and loads it into an NLTK Text object
  3. # (c) 2013 Nathan Smith <nathan@smithfam.info>
  4. #
  5. # This program is free software: you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation, either version 3 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. import codecs
  18. import nltk
  19. import os
  20. nt = ""
  21. texts = os.listdir('out/')
  22. for text in texts:
  23. f = codecs.open('out/' + text,encoding='utf-8')
  24. nt += f.read()
  25. f.close()
  26. # Need to avoid using default wordpunct tokenizer here, since it erroneously
  27. # splits elisions from tokens.
  28. # \u2019 is the character used by SBLGNT for elisions, so it is added
  29. nt_tokens = nltk.tokenize.regexp.regexp_tokenize(nt,
  30. u'\w+\u2019?|[^\w\s\u2019]+')
  31. sblgnt_text = nltk.text.Text([w.encode('utf-8') for w in nt_tokens])
  32. sblgnt_text.collocations()
  33. sblgnt_text.generate()