A collection of various scripts and other utilities useful in Biblical studies.

wordsofjesus.py 2.5KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. #! /usr/bin/env python
  2. #
  3. # Script to extract and rank all of Jesus' words
  4. # (c) 2012 Nathan Smith <nathan@smithfam.info>
  5. #
  6. # Permission is hereby granted, free of charge, to any person obtaining a copy
  7. # of this software and associated documentation files (the "Software"), to deal
  8. # in the Software without restriction, including without limitation the rights
  9. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10. # copies of the Software, and to permit persons to whom the Software is
  11. # furnished to do so, subject to the following conditions:
  12. #
  13. # The above copyright notice and this permission notice shall be included in
  14. # all copies or substantial portions of the Software.
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21. # THE SOFTWARE.
  22. import os
  23. import StringIO
  24. import urllib
  25. import xml.sax
  26. import zipfile
  27. import nltk
  28. from nltk.corpus import stopwords
# English stop words from the NLTK stopwords corpus; the script filters these
# out of the token list before counting frequencies.
stops = stopwords.words('english')
# Zip archive that is downloaded when no local copy exists; the script reads
# the "eng-web_usfx.xml" member from it.
url = 'http://ebible.org/web/eng-web_usfx.zip'
  31. class WEBParser(xml.sax.handler.ContentHandler):
  32. """Class to parse WEB XML and extract words of Jesus"""
  33. def __init__(self):
  34. self.in_wj = False
  35. self.words = ""
  36. def startElement(self,name, attrs):
  37. if name == "wj":
  38. self.in_wj = True
  39. def characters(self, data):
  40. if self.in_wj:
  41. self.words += data.lower()
  42. def endElement(self, name):
  43. if name == "wj":
  44. self.in_wj = False
  45. if __name__ == '__main__':
  46. if not os.path.exists(os.path.basename(url)):
  47. try:
  48. urllib.urlretrieve(url, os.path.basename(url))
  49. except:
  50. sys.exit(1)
  51. web_zip = os.path.basename(url)
  52. parser = xml.sax.make_parser()
  53. web = WEBParser()
  54. parser.setContentHandler(web)
  55. _zip = zipfile.ZipFile(web_zip)
  56. parser.parse(StringIO.StringIO(_zip.read("eng-web_usfx.xml")))
  57. words_of_jesus = nltk.word_tokenize(web.words)
  58. word_list = [w for w in words_of_jesus if (w not in stops and w.isalpha())]
  59. fdist = nltk.FreqDist(word_list)
  60. for word, count in fdist.items()[:100]:
  61. print "%s - %d" % (word, count)