This is a working page. Entries are not limited to references, but include
short texts turned up by searches. References are in varied formats (ascii,
bibtex) and from a variety of sites, of which the New Zealand Digital Library
is foremost.
Likely keywords: text
summarization, automatic abstracting, automatic summary, automated
summary, natural language summary, summarizing texts, tailored summary,
condensation;
text skimming, partial parsing,
fragmentary parsing;
tagger.
Search engines/sites:
glimpse,
nzdl,
dortmund,
ncstrl,
bibnet,
altavista,
ucstri,
db&lp_bib
TEXT SUMMARIZATION
from Amit Singhal, February 1995:
Automatic Text Decomposition Using Text Segments and Text
Themes. Gerard Salton, Amit Singhal, Chris Buckley, and Mandar Mitra,
Hypertext '96 (to appear). Also Technical Report TR95-1555, Department
of Computer Science, Cornell University. (Available from
http://www.cs.cornell.edu)
Automatic Text Decomposition and Structuring. Gerard Salton, James
Allan, and Amit Singhal, Information Processing and Management, 32(2),
127-138, 1996.
Automatic Analysis, Theme Generation, and Summarization of Machine
Readable Texts. Gerard Salton, James Allan, Chris Buckley, and Amit
Singhal, Science 264 (3 June, 1994), 1421-1426.
from James Allan, February, 1995:
G. Salton and J. Allan. ``Automatic Text Decomposition and
Structuring''. RIAO '94.
G. Salton and J. Allan. ``Selective Text Utilization and Text
Traversal''. Proceedings of the {\em Fifth Annual ACM Conference on
Hypertext}, November 1993, pp 131-144. Also Cornell Computer Science
Technical Report 93--1366.
G. Salton and J. Allan. ``Selective Text Utilization and Text
Traversal''. International Journal of Human-Computer
Studies, v.{\bf 43}, pp.~483-497, 1995.
G. Salton, C. Buckley, and J. Allan. ``Automatic Structuring and
Retrieval of Large Text Files''. {\em Communications of
the ACM\/}, February, 1994. Also Cornell Computer Science Technical
Report 92--1286.
% SIGIR 1995 conference paper on multi-article news summarization.
% "Text Summarization" is presumably the conference session heading, so it is
% stored in the non-standard `session` field (ignored by styles) instead of
% `series`, where standard styles would print it as a book series.
@InProceedings{McKeown95,
  author    = {McKeown, Kathleen and Radev, Dragomir R.},
  title     = {Generating Summaries of Multiple News Articles},
  booktitle = {Proceedings of the Eighteenth Annual International ACM
               SIGIR Conference on Research and Development in
               Information Retrieval},
  session   = {Text Summarization},
  pages     = {74--82},
  year      = {1995},
  copyright = {(c) Copyright 1995 Association for Computing
               Machinery},
  keywords  = {Natural language summarization, Natural language
               generation, Summarization of multiple texts},
  abstract  = {We present a natural language system which summarizes
               a series of news articles on the same event. It uses
               summarization operators, identified through empirical
               analysis of a corpus of news summaries, to group
               together templates from the output of the systems
               developed for ARPA's Message Understanding Conferences.
               Depending on the available resources (e.g., space),
               summaries of different length can be produced. Our
               research also provides a methodological framework for
               future work on the summarization task and on the
               evaluation of news summarization systems.},
}
% SIGIR 1995 conference paper on trainable extract-based summarization.
% "Text Summarization" is presumably the conference session heading, so it is
% stored in the non-standard `session` field (ignored by styles) instead of
% `series`, where standard styles would print it as a book series.
% The "*" markers in the abstract are kept exactly as found in the source record.
@InProceedings{Kupiec95,
  author    = {Kupiec, Julian and Pedersen, Jan and Chen, Francine},
  title     = {A Trainable Document Summarizer},
  booktitle = {Proceedings of the Eighteenth Annual International ACM
               SIGIR Conference on Research and Development in
               Information Retrieval},
  session   = {Text Summarization},
  pages     = {68--73},
  year      = {1995},
  copyright = {(c) Copyright 1995 Association for Computing
               Machinery},
  keywords  = {Summary sentence, Original documents, Summary pairs,
               Training corpus, Document extracts},
  abstract  = {* To summarize is to reduce in complexity, and hence
               in length, while retaining some of the essential
               qualities of the original. * This paper focusses on
               document extracts, a particular kind of computed
               document summary. * Document extracts consisting of
               roughly 20\% of the original can be as informative as
               the full text of a document, which suggests that even
               shorter extracts may be useful indicative summaries. *
               The trends in our results are in agreement with those
               of Edmundson who used a subjectively weighted
               combination of features as opposed to training the
               feature weights using a corpus. * We have developed a
               trainable summarization program that is grounded in a
               sound statistical framework.},
}
% ACM Hypertext'93 conference paper on selective text traversal.
% "Papers" is presumably the proceedings section heading, so it is stored in
% the non-standard `session` field (ignored by styles) instead of `series`,
% where standard styles would print it as a book series.
@InProceedings{Salton93,
  author    = {Salton, Gerard and Allan, James},
  title     = {Selective Text Utilization and Text Traversal},
  booktitle = {Proceedings of ACM Hypertext'93},
  session   = {Papers},
  pages     = {131--144},
  year      = {1993},
  copyright = {(c) Copyright 1993 Association for Computing
               Machinery},
  keywords  = {Full-text access, Information retrieval, Passage
               retrieval, Text analysis, Global text comparisons,
               Local context checking, Automatic text linking,
               Selective text reading, Text summarization},
  abstract  = {Many large collections of full-text documents are
               currently stored in machine-readable form and processed
               automatically in various ways. These collections may
               include different types of documents, such as messages,
               research articles, and books, and the subject matter
               may vary widely. To process such collections, robust
               text analysis methods must be used, capable of handling
               materials in arbitrary subject areas, and flexible
               access must be provided to texts and text excerpts of
               varying size. In this study, global text comparison
               methods are used to identify similarities between text
               elements, followed by local context-checking operations
               that resolve ambiguities and distinguish superficially
               similar texts from texts that actually cover identical
               topics. A linked text structure is then created that
               relates similar texts at various levels of detail. In
               particular, text links are available for full texts, as
               well as text sections, paragraphs, and sentence groups.
               The linked structures are usable to identify important
               text passages, to traverse texts selectively both
               within particular documents and between documents, and
               to provide flexible text access to large text
               collections in response to various kinds of user needs.
               An automated 29-volume encyclopedia is used as an
               example to illustrate the text accessing and traversal
               operations.},
}
% Cornell CS technical report on text theme generation and summarization.
% `number` holds only the report number (TR94-1438); the archive prefix stays
% in the citation key so the rendered reference reads "Technical Report TR94-1438".
@TechReport{CORNELLCS//TR94-1438,
  author      = {Salton, Gerard and Singhal, Amit},
  title       = {Automatic Text Theme Generation and the Analysis of
                 Text Structure},
  institution = {Cornell University, Computer Science Department},
  type        = {Technical Report},
  number      = {TR94-1438},
  pages       = {27},
  month       = jul,
  year        = {1994},
  language    = {English},
  abstract    = {Non-expository texts are not usually read from cover
                 to cover. Readers are helped in such circumstances by
                 providing selective access to text excerpts as needed.
                 Text themes can be identified representing areas of
                 importance in a text, and summaries can be constructed
                 automatically. In this study, text theme generation and
                 text summarization are related to text structure. It is
                 shown that useful text derivatives are obtainable for
                 texts with diverse structural characteristics.},
}
% COLING-86 is a conference, not a journal: the entry is typed as a
% proceedings paper and the venue is given in `booktitle`.
@InProceedings{Fum86,
  author    = {Fum, Danilo and Guida, Giovanni and Tasso, Carlo},
  title     = {Tailoring importance evaluation to reader's goals: a
               contribution to descriptive text summarization},
  booktitle = {Proceedings of the 11th International Conference on
               Computational Linguistics ({COLING}-86)},
  pages     = {256--259},
  year      = {1986},
}
% IJCAI-85 conference paper on importance evaluation for text summarization.
@InProceedings{ijcai85*840,
  author    = {Fum, Danilo and Guida, Giovanni and Tasso, Carlo},
  title     = {Evaluating Importance: {A} Step Towards Text
               Summarization},
  editor    = {Joshi, Aravind},
  booktitle = {Proceedings of the 9th International Joint Conference
               on Artificial Intelligence},
  pages     = {840--844},
  publisher = {Morgan Kaufmann},
  address   = {Los Angeles, CA},
  month     = aug,
  year      = {1985},
}
% COLING-82 is a conference, not a journal: the entry is typed as a
% proceedings paper and the venue is given in `booktitle`.
@InProceedings{Fum82,
  author    = {Fum, Danilo and Guida, Giovanni and Tasso, Carlo},
  title     = {Forward and backward reasoning in automatic
               abstracting},
  booktitle = {Proceedings of the 9th International Conference on
               Computational Linguistics ({COLING}-82)},
  pages     = {83--88},
  year      = {1982},
}
1977: S. L. Taylor and G. K. Krulee and L. T. Henschen
Automatic Abstracting of Textual Material
1961: H. P. Edmundson and R. E. Wyllys
Automatic abstracting and indexing, survey and recommendations
SIG/ALP - WHAT KINDS OF TEXT SUMMARY ARE POSSIBLE NOW/ KUKICH K
IN: PROCEEDINGS OF THE ASIS ANNUAL MEETING 1996 Vol.33 Pages 266 -
266
CARLETON LIBRARY HOLDS THIS TITLE
Z699.A1A62
FLOOR 1 SER: v.5- 1968-
LATEST ISSUE RECEIVED: vol.31 / 1994
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
AUTOMATIC SUMMARY; NATURAL LANGUAGE TEXT GENERATION; NATURAL
LANGUAGE SUMMARY; AUTOMATIC ABSTRACTING
AN EXPERIMENT IN THE USE OF TOOLS FOR COMPUTER-ASSISTED ABSTRACTING/
CRAVEN TC
IN: PROCEEDINGS OF THE ASIS ANNUAL MEETING 1996 Vol.33 Pages 203 -
208
CARLETON LIBRARY HOLDS THIS TITLE
Z699.A1A62
FLOOR 1 SER: v.5- 1968-
LATEST ISSUE RECEIVED: vol.31 / 1994
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
Experimental subjects wrote abstracts of an article using a
simplified version of the TEXNET abstracting assistance software.
In addition to the full text, the 35 subjects were presented with
either keywords or phrases extracted automatically. The resulting
abstracts, and the times taken, were recorded automatically; some
additional information was gathered by oral questionnaire.
Results showed considerable variation among subjects, but 37%
found the keywords or phrases ''quite'' or ''very'' useful in
writing their abstracts. Statistical analysis failed to support
several hypothesized relations: phrases were not viewed as
significantly more helpful than keywords; and abstracting
experience did not correlate with originality of wording,
approximation of the author abstract, or greater conciseness.
Results also suggested possible modifications to the software.
RETRIEVAL; DISPLAY; TEXT
THE ART OF ABSTRACTING - CREMMINS,ET NICKUM MJ
IN: LIBRARY JOURNAL 1996 JUN 1 Vol.121 No.10 Pages 160 - 160
CARLETON LIBRARY HOLDS THIS TITLE
Z671.L7
FLOOR 1 SER: v.72-<84-85>- 1947-
LATEST ISSUE RECEIVED: vol.121 no.19+S / 1996 NOV
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
HIGHLIGHTS - LANGUAGE-INDEPENDENT AND DOMAIN-INDEPENDENT
AUTOMATIC-INDEXING TERMS FOR ABSTRACTING (VOL 46, PG 162, 1995)/
COHEN JD
IN: JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE 1996 MAR
Vol.47 No.3 Pages 260 - 260
CARLETON LIBRARY HOLDS THIS TITLE
Z1001.A42
FLOOR 1 SER: v.21- 1970-
LATEST ISSUE RECEIVED: vol.47 no.11 / NOV 1996
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
ABSTRACTING FROM THE PERSPECTIVE OF TEXT PRODUCTION/ ROTHKEGEL A
IN: INFORMATION PROCESSING & MANAGEMENT 1995 SEP Vol.31 No.5
Pages 777 - 784
CARLETON LIBRARY DOES NOT HOLD THIS TITLE
UNIVERSITY OF OTTAWA LIBRARIES DO NOT HOLD THIS TITLE
This paper takes the view that an abstract itself is a text which
is subjected to general and specific conditions of
text-production. It is assumed that the goal -- namely the forming
of the abstract as a text -- controls the whole process of
abstracting. This goal-oriented view contrasts to most approaches
in this domain which are source-text oriented. Further,
production strategies are described in terms of text structure
building processes which are re-constructed with methods of
modelling in the area of text-linguistics and computational
linguistics. This leads to a close relationship between the
representation of the model and the resulting text. In this view,
examples are given in which authentical material of abstracts is
analysed according to the model. The model itself integrates
three text levels (content, function, form) which are combined
and represented in terms of the writer's activities.
GENERATING SUMMARIES FROM EVENT DATA/ MAYBURY MT
IN: INFORMATION PROCESSING & MANAGEMENT 1995 SEP Vol.31 No.5
Pages 735 - 751
CARLETON LIBRARY DOES NOT HOLD THIS TITLE
UNIVERSITY OF OTTAWA LIBRARIES DO NOT HOLD THIS TITLE
Summarization entails analysis of source material, selection of key
information, condensation of this, and generation of a compact
summary form. While there have been many investigations into the
automatic summarization of text, relatively little attention has
been given to the summarization of information from structured
information sources such as data or knowledge bases, despite this
being a desirable capability for a number of application areas
including report generation from databases (e.g. weather,
financial, medical) and simulations (e.g. military,
manufacturing, economic). After a brief introduction indicating
the main elements of summarization and referring to some
illustrative approaches to it, this article considers specific
issues in the generation of text summaries of event data. It
describes a system, SumGen, which selects key information from an
event database by reasoning about event frequencies, frequencies
of relations between events, and domain specific importance
measures. The article describes how SumGen then aggregates
similar information and plans a summary presentation tailored to
a stereotypical user. Finally, the article evaluates SumGen
performance, and also that of a much more limited second
summariser, by assessing information extraction by 22 human
subjects from both source and summary texts. This evaluation
shows that the use of SumGen reduces average sentence length by
approx. 15%, document length by 70%, and time to perform
information extraction by 58%.
AUTOMATED SUMMARY; NATURAL LANGUAGE GENERATION; IMPORTANCE;
CONDENSATION; AGGREGATION; TAILORED SUMMARY; AUTOMATED ABSTRACTING
HOW TO IMPLEMENT A NATURALISTIC MODEL OF ABSTRACTING - 4 CORE
WORKING STEPS OF AN EXPERT ABSTRACTOR/ ENDRESNIGGEMEYER B
IN: INFORMATION PROCESSING & MANAGEMENT 1995 SEP Vol.31 No.5
Pages 631 - 674
CARLETON LIBRARY DOES NOT HOLD THIS TITLE
UNIVERSITY OF OTTAWA LIBRARIES DO NOT HOLD THIS TITLE
Four working steps taken from a comprehensive empirical model of
expert abstracting are studied in order to prepare an explorative
implementation of a simulation model. It aims at explaining the
knowledge processing activities during professional summarizing.
Following the case-based and holistic strategy of qualitative
empirical research, we develop the main features of the
simulation system by investigating in detail a small but central
test case -- four working steps where an expert abstractor discovers
what the paper is about and drafts the topic sentence of the
abstract. Following the KADS methodology of knowledge
engineering, our discussion begins with the empirical model (a
conceptual model in KADS terms) and aims at a computational model
which is implementable without determining the concrete
implementation tools (the design model according to KADS). The
envisaged solution uses a blackboard system architecture with
cooperating object-oriented agents representing cognitive
strategies and a dynamic text representation which borrows its
conceptual relations in particular from RST (Rhetorical Structure
Theory). As a result of the discussion we feel that a small
simulation model of professional summarizing is feasible.
INFORMATION
MAIER E, SIGEL A,
A HYPERTEXT TUTORIAL ON ABSTRACTING FOR LIBRARY-SCIENCE STUDENTS/
KOLTAY T
IN: JOURNAL OF EDUCATION FOR LIBRARY AND INFORMATION SCIENCE 1995
SPR Vol.36 No.2 Pages 170 - 173
CARLETON LIBRARY DOES NOT HOLD THIS TITLE
UNIVERSITY OF OTTAWA LIBRARIES DO NOT HOLD THIS TITLE
DOCUMENTARY ABSTRACTING - TOWARD A METHODOLOGICAL MODEL/
MOLINA MP
IN: JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE 1995 APR
Vol.46 No.3 Pages 225 - 234
CARLETON LIBRARY HOLDS THIS TITLE
Z1001.A42
FLOOR 1 SER: v.21- 1970-
LATEST ISSUE RECEIVED: vol.47 no.11 / NOV 1996
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
In the general abstracting process (GAP), there are two types of
data: textual, within a particularly framed trilogy (surface,
deep, and rhetoric); and documentary (abstractor, means of
production, and user demands). For its development, the use of
the following disciplines, among others, is proposed: linguistics
(structural, transformational, and textual), logic (formal and
fuzzy), and psychology (cognitive). The model for that textual
transformation is based on a system of combined strategies with
four key stages: reading-understanding, selection,
interpretation, and synthesis.
INFORMATION-RETRIEVAL; SUMMARIZING TEXTS
HIGHLIGHTS - LANGUAGE-INDEPENDENT AND DOMAIN-INDEPENDENT
AUTOMATIC-INDEXING TERMS FOR ABSTRACTING/
COHEN JD
IN: JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE 1995 APR
Vol.46 No.3 Pages 162 - 174
CARLETON LIBRARY HOLDS THIS TITLE
Z1001.A42
FLOOR 1 SER: v.21- 1970-
LATEST ISSUE RECEIVED: vol.47 no.11 / NOV 1996
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
A method of drawing index terms from text is presented. The
approach uses no stop list, stemmer, or other language- and
domain-specific component, allowing operation in any language or
domain with only trivial modification. The method uses n-gram
counts, achieving a function similar to, but more general than, a
stemmer. The generated index terms, which the author calls
''highlights,'' are suitable for identifying the topic for
perusal and selection. An extension is also described and
demonstrated which selects index terms to represent a subset of
documents, distinguishing them from the corpus. Some experimental
results are presented, showing operation in English, Spanish,
German, Georgian, Russian, and Japanese.
TEXT-RETRIEVAL; ALGORITHMS; SYSTEMS; ACCESS; ERRORS; WORDS
MACHINE-SUPPORTED CONDENSATION OF TECHNICAL TEXTS WITH CONNY -
ABSTRACTING BY TAKING A NACHRICHTEN-FUR-DOKUMENTATION TEXTCORPUS
AS AN EXAMPLE/ RUDA S
IN: NACHRICHTEN FUR DOKUMENTATION 1994 NOV-DEC Vol.45 No.6 Pages
335 - 342
CARLETON LIBRARY DOES NOT HOLD THIS TITLE
UNIVERSITY OF OTTAWA LIBRARIES DO NOT HOLD THIS TITLE
Documents of the journal ''Nachrichten fur Dokumentation'' written
over a twenty-year period (1969-1989) by 50 different authors
have been used as textcorpus. The analysis of the abstracts
revealed that only 15 out of 50 abstracts consist exclusively of
''standard'' abstract sentences and that no abstract satisfies
all requirements of the abstracting guidelines. In this respect,
they signal the abstracting guidelines as ''wishful thinking'',
which supports the idea of machine-supported abstracting by
linguistic features. CONNY is an interactive linguistic
abstracting model for technical texts offering the abstractor
general abstracting guidelines operating on the surface
structure. It condenses the parts of source text assessed as
abstract relevant on source text, sentence and abstract level
with regard to lexic, syntax and semantic.
A THESAURUS FOR USE IN A COMPUTER-AIDED ABSTRACTING TOOL KIT/
CRAVEN TC
IN: PROCEEDINGS OF THE ASIS ANNUAL MEETING 1993 Vol.30 Pages 178 -
184
CARLETON LIBRARY HOLDS THIS TITLE
Z699.A1A62
FLOOR 1 SER: v.5- 1968-
LATEST ISSUE RECEIVED: vol.31 / 1994
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
A multi-purpose thesaurus is among abstracting assistance features
being prototyped in the TEXNET text network management system.
The thesaurus is intended to support vocabulary control,
including production of a variety of printed thesaurus displays,
as well as automatic weighting of passages and Roget-style
suggestion of alternate terms.
INFORMATION-RETRIEVAL
A COMPUTER-AIDED ABSTRACTING TOOL KIT/ CRAVEN TC
IN: CANADIAN JOURNAL OF INFORMATION AND LIBRARY SCIENCE-REVUE
CANADIENNE DES SCIENCES DE L INFORMATION ET DE BIBLIOTHECONOMIE
1993 JUL Vol.18 No.2 Pages 19 - 31
CARLETON LIBRARY HOLDS THIS TITLE
Z671.C32
FLOOR 1 SER: v.18- 1993-
LATEST ISSUE RECEIVED: vol.21 no.1 / 1996 APR
UNIVERSITY OF OTTAWA LIBRARIES DO NOT HOLD THIS TITLE
Abstracting assistance features are being prototyped in the TEXNET
text network management system. Sentence weighting methods
available include: weighting negatively or positively on the
stems in a selected passage; weighting on general lists of cue
words; adjusting weights of selected segments, and weighting on
occurrences of frequent stems. The user may adjust a number of
parameters: the minimum length of extracts; the threshold for a
''frequent'' word/stem; and the amount a sentence weight is to be
adjusted for each weighting type.
SENTENCE DEPENDENCY STRUCTURES; GRAPHIC DISPLAY
AUTOMATIC-ANALYSIS, THEME GENERATION, AND SUMMARY OF
MACHINE-READABLE TEXTS/ SALTON G
IN: SCIENCE 1994 JUN 3 Vol.264 No.5164 Pages 1421 - 1426
CARLETON LIBRARY HOLDS THIS TITLE
Q1.S35
FLOOR 1 SER: v.<56>, 1927; <87>-<92>- 1938-
FLOOR 1 MICROFORMS MFL: v.1-142, 1895-1963; 151-268, 1966-95
CANCELLED 1995
LATEST ISSUE RECEIVED: no.5290 / 11-15-96
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
Vast amounts of text material are now available in machine-readable
form for automatic processing. Here, approaches are outlined for
manipulating and accessing texts in arbitrary subject areas in
accordance with user needs. In particular, methods are given for
determining text themes, traversing texts selectively, and
extracting summary statements that reflect text content.
RETRIEVAL; HYPERTEXT
ALLAN J, BUCKLEY C, SINGHAL A,
MAIN POINTS IN AN INSTRUCTIONAL TEXT, AS IDENTIFIED BY STUDENTS AND
BY THEIR TEACHERS/ SCHELLINGS GLM
IN: READING RESEARCH QUARTERLY 1995 OCT-DEC Vol.30 No.4 Pages 742
- 756
CARLETON LIBRARY HOLDS THIS TITLE
LB1050.R42
FLOOR 4 SER: v.5- 1969/70-
LATEST ISSUE RECEIVED: vol.31 no.4 / 1996 OCT
UNIVERSITY OF OTTAWA LIBRARIES HOLD THIS TITLE
THREE APPROACHES in identifying main points in instructional texts
can be distinguished: a linguistic, a cognitive-psychological,
and an educational approach. The linguistic approach assumes that
any text contains fixed main points that can be deduced from its
structure. The cognitive-psychological approach assumes that main
points depend primarily on reader variables, such as the reader's
own goals, interests, and previous knowledge. The educational
approach regards main points to be dependent on instructional
variables. In an instructional setting, it is important for
students to assess which parts of the text their teacher
considers important, because the teacher will set the task
demands and test questions about that text. To obtain more
insight into the educational approach, a study was conducted in
which 88 secondary school students and their biology teachers in
the Netherlands were asked to identify main points stated in an
instructional biology text. They were required to underline text
elements that they judged to be important in an instructional
situation. There was a large variation in the number of main
points underlined by the students and even by the teachers. There
was also a large variation in the agreement or correspondence
between students and their teachers. The students mentioned
different kinds of reasons for selecting main points. It appeared
that these reasons could be categorized on the basis of the
three forementioned approaches. Students who matched closely with
their teacher mentioned educational reasons more often than
students who matched poorly with their teacher. The educational
approach could be a useful contribution in main idea
comprehension research.
SUMMARIZING TEXTS; WRITTEN SUMMARIES; STORY
VANHOUTWOLTERS BHAM,
TEXT SKIMMING
Michael Mauldin, CMU Computer Science:
Information Retrieval by Text Skimming.
ABSTRACT
I will report on the progress I have made for my thesis entitled
``Information Retrieval by Text Skimming.''
Most information retrieval systems today are word based. But simple
word searches and frequency distributions do not provide these systems
with an understanding of their texts. Full natural language parsers
are capable of deep understanding within limited domains, but are too
brittle and slow for general information retrieval.
My dissertation is an attempt to bridge this gap by using a text
skimming parser as the basis for an information retrieval system that
partially understands the texts stored in it. The objective is to
develop a system capable of retrieving a significantly greater fraction
of relevant documents than is possible with a keyword based approach,
without retrieving a larger fraction of irrelevant documents. As part
of my dissertation, I am implementing a full-text information retrieval
system called FERRET (Flexible Expert Retrieval of Relevant English
Texts). FERRET will provide information retrieval for the UseNet News
system, a collection of 247 news groups covering a wide variety of
topics. Currently FERRET reads SCI.ASTRO, the Astronomy news group,
and part of my investigation will be to demonstrate the addition of new
domains with only minimal hand coding of domain knowledge. FERRET will
acquire the details of a domain automatically using a script learning
component.
Information Retrieval by Text Skimming, PhD Thesis, Carnegie
Mellon University. August, 1989 (also available as CMU Computer
Science technical report CMU-CS-89-193)
10. M. Mauldin, "Information Retrieval by Text Skimming," doctoral
dissertation, Carnegie Mellon Univ., Pittsburgh, Aug. 1989. (Also
available as CMU Tech. Report CMU-CS-89-193.) Revised edition
published as "Conceptual Information Retrieval: A Case Study in
Adaptive Partial Parsing," Kluwer Academic Publishers, Boston,
Mass., Sept. 1991.
Klaus-Peter Gores with Rainer Bleisinger
DFKIBIB DFKI Publications Technical Memos Year 94
_______________________________________________________________
TM-94-01
Text Skimming as a Part in Paper Document Understanding
14 Pages
Abstract
In our document understanding project ALV we analyse
incoming paper mail in the domain of single-sided German
business letters. These letters are scanned and after
several analysis steps the text is recognized. The result
may contain gaps, word alternatives, and even illegal
words. The subject of this paper is the subsequent phase
which concerns the extraction of important information
predefined in our "message type model". An expectation
driven partial text skimming analysis is proposed
focussing on the kernel module, the so-called
"predictor". In contrast to traditional text skimming the
following aspects are important in our approach.
Basically, the input data are fragmentary texts. Rather
than having one text analysis module ("substantiator")
only, our predictor controls a set of different and
partially alternative substantiators.
With respect to the usually proposed three working phases
of a predictor -- start, discrimination, and
instantiation -- the following differences are
remarkable. The starting problem of text skimming is
solved by applying specialized substantiators for
classifying a business letter into message types. In
order to select appropriate expectations within the
message type hypotheses a twofold discrimination is
performed. A coarse discrimination reduces the number of
message type alternatives, and a fine discrimination
chooses one expectation within one or a few previously
selected message types. According to the expectation
selected substantiators are activated. Several rules are
applied both for the verification of the substantiator
results and for error recovery if the results are
insufficient.
% IJCAI-77 conference paper on skimming newspaper stories.
% The source record gives the same first and last page, so `pages` is
% written as a single page number rather than a degenerate range.
@InProceedings{ijcai77*16,
  author    = {DeJong, G.},
  title     = {Skimming Newspaper Stories by Computer},
  editor    = {Reddy, Raj},
  booktitle = {Proceedings of the 5th International Joint Conference
               on Artificial Intelligence},
  pages     = {16},
  publisher = {William Kaufmann},
  address   = {Cambridge, MA},
  month     = aug,
  year      = {1977},
}
TAGGER
@InProceedings{cutting-kupiec-pedersen-sibun-92,
author = "Doug Cutting and Julian Kupiec and Jan Pedersen and
Penelope Sibun",
title = "A Practical part-of-speech tagger",
booktitle = "Proceedings of the Third Conference on Applied Natural
Language Processing",
year = "1992",