{"id":14552,"date":"2022-01-13T04:53:45","date_gmt":"2022-01-12T20:53:45","guid":{"rendered":"https:\/\/achieve.dhcn.cn\/?p=14552"},"modified":"2022-01-22T20:07:33","modified_gmt":"2022-01-22T12:07:33","slug":"using-an-advanced-text-index-structure-for-corpus-exploration-in-digital-humanities","status":"publish","type":"post","link":"https:\/\/achieve.dhcn.cn\/en\/site\/works\/papers\/subject\/literature\/14552.html","title":{"rendered":"Using an Advanced Text Index Structure for Corpus Exploration in Digital Humanities"},"content":{"rendered":"<p>\u4f5c\u8005\uff1aTobias Englmeier, CIS, Ludwig-Maximilians University, Munich, Germany<br>Marco B\u00fcchler, Institute of Computer Science, University of G\u00f6ttingen, G\u00f6ttingen, Germany<br>Stefan Gerdjikov, FMI, University of Sofia &#8220;St. Kliment Ohridski&#8221;, Sofia, Bulgaria<br>Klaus U. Schulz , CIS, Ludwig-Maximilians University, Munich, Germany<\/p>\n\n\n\n<p>\u8f6c\u8f7d\u6765\u6e90\uff1a<em><em>Digital Humanities Quarterly<\/em>,<\/em> 2021,\u00a0Volume 15 Number 1, 
http:\/\/www.digitalhumanities.org\/dhq\/vol\/15\/1\/000526\/000526.html<\/p>\n\n\n\n<p>\u901a\u8fc7\u9002\u5f53\u7684\u7d22\u5f15\u7ed3\u6784\uff0c\u53ef\u4ee5\u6709\u6548\u5730\u89e3\u51b3\u8bb8\u591a\u8bed\u6599\u5e93\u641c\u7d22\u4efb\u52a1\uff0c\u800c\u65e0\u9700\u5728\u7ebf\u91cd\u65b0\u626b\u63cf\u6587\u672c\u5b58\u50a8\u5e93\u3002\u5728\u672c\u6587\u4e2d\uff0c\u6211\u4eec\u5c55\u793a\u4e86\u5bf9\u79f0\u538b\u7f29\u6709\u5411\u65e0\u73af\u8bcd\u56fe(scdawg)\u2014\u2014\u540e\u7f00\u6811\u7684\u4e00\u79cd\u7ec6\u5316\u2014\u2014\u4e3a\u8bed\u6599\u5e93\u63a2\u7d22\u63d0\u4f9b\u4e86\u4e00\u4e2a\u7406\u60f3\u7684\u57fa\u7840\uff0c\u5e2e\u52a9\u4ee5\u4e00\u79cd\u4f18\u96c5\u7684\u65b9\u5f0f\u56de\u7b54DH\u7814\u7a76\u4e2d\u63d0\u51fa\u7684\u8bb8\u591a\u95ee\u9898\u3002\u4ece\u7b80\u5316\u7684\u89d2\u5ea6\u6765\u770b\uff0cscdawg\u7684\u4f18\u70b9\u4f9d\u8d56\u4e8e\u4e24\u4e2a\u7279\u6027\u3002\u9996\u5148\uff0c\u9700\u8981\u7ebf\u6027\u8ba1\u7b97\u65f6\u95f4\uff0c\u7d22\u5f15\u63d0\u4f9b\u4e86\u5173\u4e8e\u6240\u6709\u6587\u672c\u4e4b\u95f4\u7684\u76f8\u4f3c\u6027(\u5728\u516c\u5171\u5b50\u5b57\u7b26\u4e32\u65b9\u9762)\u548c\u5dee\u5f02\u7684\u8054\u5408\u89c6\u56fe\u3002\u5176\u6b21\uff0c\u7d22\u5f15\u7684\u7ed3\u6784\u89c4\u5f8b\u6709\u52a9\u4e8e\u5728\u4e0d\u4f7f\u7528\u5148\u9a8c\u8bed\u8a00\u77e5\u8bc6\u7684\u60c5\u51b5\u4e0b\uff0c\u4ee5\u4e00\u79cd\u8bed\u8a00\u72ec\u7acb\u7684\u65b9\u5f0f\u6316\u6398\u6587\u672c\u4e2d\u6709\u8da3\u7684\u90e8\u5206(\u5982\u77ed\u8bed\u548c\u6982\u5ff5\u540d)\u53ca\u5176\u5173\u7cfb\u3002\u4e3a\u4e86\u8bc1\u660e\u8fd9\u4e9b\u539f\u5219\u7684\u5a01\u529b\uff0c\u6211\u4eec\u5c06\u7814\u7a76\u6587\u672c\u5bf9\u9f50\u3001\u4e0d\u540c\u6587\u672c\u6216\u4e0d\u540c\u4f5c\u8005\u4e4b\u95f4\u7684\u6587\u672c\u91cd\u7528\u3001\u6982\u5ff5\u7684\u81ea\u52a8\u68c0\u6d4b\u3001\u5386\u65f6\u8bed\u6599\u5e93\u4e2d\u77ed\u8bed\u7684\u65f6\u95f4\u5206\u5e03\u4ee5\u53ca\u76f8\u5173\u95ee\u9898\u3002<\/p>\n\n\n\n<div\n    
class='wp-block-pdfp-pdf-poster  alignnone'\n    id='block-1'\n    data-attributes='{&quot;file&quot;:&quot;https:\\\/\\\/achieve.dhcn.cn\\\/wp-content\\\/uploads\\\/2022\\\/01\\\/Using-an-Advanced-Text-Index-Structure-for-Corpus-Exploration-in-Digital-Humanities.pdf&quot;,&quot;align&quot;:&quot;none&quot;,&quot;alignment&quot;:&quot;left&quot;,&quot;title&quot;:&quot;&quot;,&quot;titleFontSize&quot;:&quot;16px&quot;,&quot;height&quot;:{&quot;desktop&quot;:&quot;840px&quot;,&quot;tablet&quot;:&quot;700px&quot;,&quot;mobile&quot;:&quot;400px&quot;},&quot;width&quot;:{&quot;desktop&quot;:&quot;100%&quot;,&quot;tablet&quot;:&quot;100%&quot;,&quot;mobile&quot;:&quot;100%&quot;},&quot;showName&quot;:false,&quot;print&quot;:false,&quot;onlyPDF&quot;:false,&quot;defaultBrowser&quot;:false,&quot;downloadButton&quot;:false,&quot;downloadButtonText&quot;:&quot;Download File&quot;,&quot;fullscreenButton&quot;:true,&quot;fullscreenButtonText&quot;:&quot;View Fullscreen&quot;,&quot;newWindow&quot;:false,&quot;protect&quot;:false,&quot;thumbMenu&quot;:false,&quot;sidebarOpen&quot;:false,&quot;initialPage&quot;:0,&quot;alert&quot;:false,&quot;lastVersion&quot;:false,&quot;hrScroll&quot;:false,&quot;isHideRightToolbar&quot;:false,&quot;additional&quot;:{&quot;ID&quot;:&quot;&quot;,&quot;Class&quot;:&quot;&quot;,&quot;CSS&quot;:&quot;&quot;},&quot;adobeEmbedder&quot;:false,&quot;adobeOptions&quot;:{&quot;showDownloadPDF&quot;:true,&quot;showPrintPDF&quot;:true,&quot;showAnnotationTools&quot;:true,&quot;showFullScreen&quot;:false,&quot;embedMode&quot;:&quot;SIZED_CONTAINER&quot;},&quot;popupOptions&quot;:{&quot;enabled&quot;:false,&quot;text&quot;:&quot;Open 
PDF&quot;,&quot;btnStyle&quot;:{&quot;background&quot;:&quot;#2271b1&quot;,&quot;color&quot;:&quot;#fff&quot;,&quot;fontSize&quot;:&quot;16px&quot;,&quot;padding&quot;:{&quot;top&quot;:10,&quot;right&quot;:20,&quot;bottom&quot;:10,&quot;left&quot;:10}}},&quot;popupBtnStyle&quot;:{&quot;background&quot;:&quot;#2271b1&quot;,&quot;color&quot;:&quot;#fff&quot;,&quot;padding&quot;:{&quot;top&quot;:10,&quot;right&quot;:20,&quot;bottom&quot;:10,&quot;left&quot;:10}},&quot;popupBtnText&quot;:&quot;Open Document&quot;,&quot;CSS&quot;:&quot;&quot;,&quot;socialShare&quot;:{&quot;enabled&quot;:false,&quot;facebook&quot;:true,&quot;twitter&quot;:true,&quot;linkedin&quot;:true,&quot;pinterest&quot;:true,&quot;position&quot;:&quot;top&quot;},&quot;TrpContentRestriction&quot;:{&quot;restriction_type&quot;:&quot;exclude&quot;,&quot;selected_languages&quot;:[],&quot;panel_open&quot;:true},&quot;isPremium&quot;:false}'\n    style=\"text-align: left\">\n    \n        <iframe loading=\"lazy\" title=\"\" style=\"border:0;\" width=\"100%\" height=\"800px\" class=\"pdfp_unsupported_frame\" src=\"\/\/docs.google.com\/gview?embedded=true&#038;url=https:\/\/achieve.dhcn.cn\/wp-content\/uploads\/2022\/01\/Using-an-Advanced-Text-Index-Structure-for-Corpus-Exploration-in-Digital-Humanities.pdf\"><\/iframe>\n\n    <\/div>\n\n\n\n<p>\u4f5c\u8005\u7b80\u4ecb\uff1a<\/p>\n\n\n\n<p><strong>Tobias Englmeier<\/strong><\/p>\n\n\n\n<p>&nbsp;Tobias Englmeier is a PhD candidate at the Centrum f\u00fcr Informations- und Sprachverarbeitung (CIS) at the Ludwig Maximilians University of Munich. His PhD project is centered around the topics of string matching and OCR postcorrection. 
Additionally he has been involved in the conception and implementation of numerous Digital Humanities projects coordinated by the IT Gruppe Geisteswissenschaften (ITG) at the Ludwig Maximilians University of Munich.<\/p>\n\n\n\n<p><strong>Marco B\u00fcchler<\/strong>&nbsp;<\/p>\n\n\n\n<p>Marco B\u00fcchler holds a Diploma in Computer Science. From 2006 to 2014 he worked as a Research Associate in the Natural Language Processing Group at Leipzig University. From April 2008 to March 2011 Marco served as the technical Project Manager for the eAQUA project and continued to work in that capacity for the following eTRACES project. In March 2013 he received his PhD in eHumanities. Since May 2014 he leads a Digital Humanities Research Group at the G\u00f6ttingen Centre for Digital Humanities. His research includes Natural Language Processing on Big Humanities Data. Specifically, he works on Historical Text Reuse Detection and its application in the business world. In addition to his primary responsibilities, Marco manages the Medusa project (Big Scale co-occurrence and NGram framework) as well as the TRACER machine for detecting historical text reuse.<\/p>\n\n\n\n<p><strong>Stefan Gerdjikov<\/strong>&nbsp;<\/p>\n\n\n\n<p>Stefan Gerdjikov is an Assistant Professor at the Faculty for Informatics and Mathematics in the University of Sofia. He holds a PhD degree in Mathematics from the University of Sofia. His prime research area is Natural Language Processing where he studies approximate search techniques and index structures for text mining.<\/p>\n\n\n\n<p><strong>Klaus U. Schulz<\/strong>&nbsp;<\/p>\n\n\n\n<p>Klaus U. Schulz is Professor in Computational Linguistics and since 1992 the technical director of the Centrum f\u00fcr Informations- und Sprachverarbeitung (CIS) at the Ludwig Maximilians University of Munich. 
The work of Professor Schulz concentrates on Semantic Search, Construction of Ontologies and Taxonomies, Digital Libraries, Language Technology for Optical Character Recognition and Document Analysis and Finite-State Technology.<\/p>","protected":false},"excerpt":{"rendered":"<p>\u5728\u672c\u6587\u4e2d\uff0c\u6211\u4eec\u5c55\u793a\u4e86\u5bf9\u79f0\u538b\u7f29\u6709\u5411\u65e0\u73af\u8bcd\u56fe(scdawg)\u2014\u2014\u540e\u7f00\u6811\u7684\u4e00\u79cd\u7ec6\u5316\u2014\u2014\u4e3a\u8bed\u6599\u5e93\u63a2\u7d22\u63d0\u4f9b\u4e86\u4e00\u4e2a\u7406\u60f3\u7684\u57fa\u7840\uff0c\u5e2e\u52a9\u4ee5\u4e00\u79cd\u4f18\u96c5\u7684\u65b9\u5f0f\u56de\u7b54DH\u7814\u7a76\u4e2d\u63d0\u51fa\u7684\u8bb8\u591a\u95ee\u9898\u3002<\/p>","protected":false},"author":10,"featured_media":14554,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":"","_links_to":"","_links_to_target":""},"categories":[3068,2340],"tags":[318,3309,764],"class_list":["post-14552","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-column","category-literature","tag-318","tag-3309","tag-764"],"blocksy_meta":{"styles_descriptor":{"styles":{"desktop":"","tablet":"","mobile":""},"google_fonts":[],"version":6}},"_links":{"self":[{"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/posts\/14552","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/users\/10"}],"replies":[{"embeddable":true,"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/comments?post=14552"}],"version-history":[{"count":3,"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/posts\/14552\/revisions"}],"predecessor-version":[{"id":14643,"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/posts\/14552\/re
visions\/14643"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/media\/14554"}],"wp:attachment":[{"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/media?parent=14552"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/categories?post=14552"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/achieve.dhcn.cn\/en\/wp-json\/wp\/v2\/tags?post=14552"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}