strongs-sqlite.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. #! /usr/bin/env python
  2. #
  3. # Strongs XML to sqlite3 converter
  4. # Copyright (c) 2011, 2012 Nathan Smith <nathan@smithfam.info>
  5. #
  6. # Permission is hereby granted, free of charge, to any person obtaining a copy
  7. # of this software and associated documentation files (the "Software"), to deal
  8. # in the Software without restriction, including without limitation the rights
  9. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10. # copies of the Software, and to permit persons to whom the Software is
  11. # furnished to do so, subject to the following conditions:
  12. #
  13. # The above copyright notice and this permission notice shall be included in
  14. # all copies or substantial portions of the Software.
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21. # THE SOFTWARE.
  22. import logging
  23. import os
  24. import sqlite3
  25. import StringIO
  26. import xml.sax
  27. import zipfile
  28. import urllib
  29. hebrew_source = 'https://github.com/openscriptures/strongs/raw/master/hebrew/StrongHebrewG.xml'
  30. greek_source = 'https://github.com/downloads/morphgnt/strongs-dictionary-xml/StrongsGreekDictionaryXML_1.5.zip'
  31. db_file = 'strongs.sqlite'
  32. log_file = 'strongs.log'
  33. def download(url):
  34. """Download the given URL and return a path."""
  35. if not os.path.exists(os.path.basename(url)):
  36. logging.info("Retrieving %s" % url)
  37. try:
  38. urllib.urlretrieve(url, os.path.basename(url))
  39. except:
  40. logging.error("Failed to retrieve resource.")
  41. sys.exit(1)
  42. else:
  43. logging.info("%s already exists" % os.path.basename(url))
  44. return os.path.basename(url)
  45. class StrongsDB():
  46. """Class to handle database access for Strongs import"""
  47. def __init__(self, db_file):
  48. """Initialize the database and instance vars."""
  49. self.reset_vars()
  50. self._conn = sqlite3.connect(db_file)
  51. self._cursor = self._conn.cursor()
  52. init_db_sql = "create table strongs (number text, lemma text, \
  53. xlit text, pronounce text, description text)"
  54. self._cursor.execute(init_db_sql)
  55. self._conn.commit()
  56. def reset_vars(self):
  57. """Reset instance variables between db operations.
  58. Should be called after batches of add_row, add_row_greek, and
  59. add_deriv.
  60. """
  61. self.number = ""
  62. self.lemma = ""
  63. self.xlit = ""
  64. self.pronounce = ""
  65. self.description = ""
  66. def add_row(self):
  67. """Add a full row into the database, used for Hebrew."""
  68. add_row_sql = 'insert into strongs values (?, ?, ?, ?, ?)'
  69. logging.debug("add_row_sql: %s|%s|%s|%s|%s" % (self.number,
  70. self.lemma, self.xlit, self.pronounce, self.description))
  71. self._cursor.execute(add_row_sql, (self.number, self.lemma, self.xlit,
  72. self.pronounce, self.description,))
  73. self.reset_vars()
  74. def add_row_greek(self):
  75. """Add a partial line, lacking derivation, for Greek."""
  76. arg_sql = "insert into strongs (number, lemma, xlit, pronounce) values\
  77. (?, ?, ?, ?)"
  78. logging.debug("add_row_sql: %s|%s|%s|%s" % (self.number,
  79. self.lemma, self.xlit, self.pronounce))
  80. self._cursor.execute(arg_sql, (self.number, self.lemma, self.xlit,
  81. self.pronounce,))
  82. self.reset_vars()
  83. def add_deriv(self):
  84. """Fill in the missing deriv field for Greek."""
  85. self.prepare_row()
  86. ad_sql = "update strongs set description = ? where number = ?"
  87. logging.debug("updating %s with description: %s" % (self.number,
  88. self.description))
  89. self._cursor.execute(ad_sql, (self.description, self.number,))
  90. self.reset_vars()
  91. def get_lemma(self, number):
  92. """Query the database for the given number and return the lemma.
  93. If the database does not have the lemma for that number, return the
  94. number back instead.
  95. """
  96. gl_sql = "select lemma from strongs where number=?"
  97. self._cursor.execute(gl_sql, (number,))
  98. res = self._cursor.fetchone()
  99. if not res:
  100. lemma = number
  101. else:
  102. lemma = res[0]
  103. return lemma
  104. def prepare_row(self):
  105. self.description = self.description.replace("\n","")
  106. def db_commit(self):
  107. """Commit changes to the database."""
  108. self._conn.commit()
  109. def finish(self):
  110. """Close the database connection."""
  111. self._cursor.close()
  112. class StrongsHebrewParser(xml.sax.handler.ContentHandler):
  113. """Class to parse the Strongs Hebrew xml file."""
  114. def __init__(self, db):
  115. self.in_foreign = False
  116. self.note_depth = 0
  117. self.in_entry = False
  118. self.in_trans = False
  119. self.db = db
  120. def startElement(self, name, attrs):
  121. """Actions for opening tags."""
  122. if name == "foreign":
  123. self.in_foreign = True
  124. if name == "note":
  125. self.note_depth +=1
  126. if attrs.getValue("type") == "translation":
  127. self.in_trans = True
  128. if name == "div" and attrs.getValue("type") == "entry":
  129. self.in_entry = True
  130. if name == "w" and self.in_foreign == False and self.note_depth == 0:
  131. self.db.number = attrs.getValue("ID")
  132. self.db.lemma = attrs.getValue("lemma")
  133. self.db.xlit = attrs.getValue("xlit")
  134. self.db.pronounce = attrs.getValue("POS")
  135. if name == "w" and self.note_depth > 0:
  136. if "lemma" in attrs.getNames():
  137. self.db.description += attrs.getValue("lemma")
  138. else:
  139. self.db.description += attrs.getValue("POS")
  140. def characters(self, data):
  141. """Actions for characters within tags"""
  142. if self.note_depth > 0:
  143. self.db.description += data
  144. def endElement(self, name):
  145. """Actions for closing tags."""
  146. if name == "foreign":
  147. self.in_foreign = False
  148. if name == "note":
  149. self.note_depth -=1
  150. # If we exit a note completely, close the note types
  151. if self.note_depth == 0 and not self.in_trans:
  152. # Add a space between entries when moving between notes
  153. if self.db.description and self.db.description[-1] == ";":
  154. self.db.description += " "
  155. else:
  156. self.db.description += "; "
  157. if name == "div" and self.in_entry == True:
  158. # Commit to db when each word's div tag is closed
  159. # Have to differentiate the type, since there is a div supertag.
  160. self.db.add_row()
  161. self.in_entry = False
  162. self.in_trans = False
  163. class StrongsGreekParser(xml.sax.handler.ContentHandler):
  164. """Class to parse the Strongs Greek xml file."""
  165. def __init__(self, db):
  166. self.in_entry = False
  167. self.in_strongs = False
  168. self.greek_tag = 0
  169. self.db = db
  170. def startElement(self, name, attrs):
  171. """Actions for opening tags."""
  172. if name == "entry":
  173. self.in_entry = True
  174. if name == "strongs":
  175. self.in_strongs = True
  176. if name == "greek":
  177. # Ignore <greek> tags after the first one.
  178. # Corrects a bug in G1
  179. self.greek_tag += 1
  180. if self.greek_tag == 1:
  181. self.db.lemma = attrs.getValue("unicode")
  182. self.db.xlit = attrs.getValue("translit")
  183. if name == "pronunciation":
  184. self.db.pronounce = attrs.getValue("strongs")
  185. def characters(self, data):
  186. """Actions for characters within tags"""
  187. if self.in_strongs:
  188. self.db.number = "G%s" % data
  189. def endElement(self, name):
  190. """Actions for closing tags."""
  191. if name == "entry":
  192. self.in_entry = False
  193. self.greek_tag = 0
  194. self.db.add_row_greek()
  195. if name == "strongs":
  196. self.in_strongs = False
  197. class StrongsG2Parser(xml.sax.handler.ContentHandler):
  198. """Class to fill in strongs_derivation in Greek from db."""
  199. def __init__(self, db):
  200. self.in_desc = False
  201. self.in_strongs = False
  202. self.db = db
  203. def startElement(self, name, attrs):
  204. """Actions for opening tags."""
  205. if name == "strongs_derivation":
  206. self.in_desc = True
  207. if name == "strongs":
  208. self.in_strongs = True
  209. if name == "strongsref":
  210. lang = attrs.getValue("language")
  211. num = attrs.getValue("strongs").lstrip("0")
  212. number = "%s%s" % (lang[0], num)
  213. logging.debug("Querying strongsref %s for entry %s" % (number,
  214. self.db.number))
  215. lemma = self.db.get_lemma(number)
  216. self.db.description += lemma
  217. def characters(self, data):
  218. """Actions for characters within tags"""
  219. if self.in_desc:
  220. self.db.description += data
  221. if self.in_strongs:
  222. self.db.number = "G%s" % data
  223. def endElement(self, name):
  224. """Actions for closing tags."""
  225. if name == "entry":
  226. self.db.add_deriv()
  227. if name == "entry":
  228. self.in_desc = False
  229. if name == "strongs":
  230. self.in_strongs = False
  231. if __name__ == "__main__":
  232. # Configure log level here
  233. logging.basicConfig(level=logging.DEBUG, filename=log_file)
  234. # Initialize the db here
  235. db = StrongsDB(db_file)
  236. # Parse the Hebrew here
  237. hebrew_xml = download(hebrew_source)
  238. logging.info("Parsing Hebrew XML")
  239. hebrew_parser = xml.sax.make_parser()
  240. hebrew_parser.setContentHandler(StrongsHebrewParser(db))
  241. h = open(hebrew_xml)
  242. hebrew_parser.parse(h)
  243. h.close()
  244. db.db_commit()
  245. # Parse the Greek here
  246. greek_zip = download(greek_source)
  247. logging.info("Parsing Greek XML")
  248. greek_parser = xml.sax.make_parser()
  249. greek_parser.setContentHandler(StrongsGreekParser(db))
  250. _zip = zipfile.ZipFile(greek_zip)
  251. greek_parser.parse(StringIO.StringIO(_zip.read("strongsgreek.xml")))
  252. db.db_commit()
  253. # Second pass on the Greek to retrieve missing lemmas in strongs_derivation
  254. logging.info("Finish Greek Strongs derivations")
  255. g2_parser = xml.sax.make_parser()
  256. g2_parser.setContentHandler(StrongsG2Parser(db))
  257. g2_parser.parse(StringIO.StringIO(_zip.read("strongsgreek.xml")))
  258. db.db_commit()
  259. # All Done
  260. logging.info("Finished. sqlite database at %s is ready." % db_file)