Viewing file: NITF.php (22.81 KB) -rw-rw-rw- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
/** * NITF XML Parser * * PHP versions 4 and 5 * * LICENSE: This source file is subject to version 3.0 of the PHP license * that is available through the world-wide-web at the following URI: * http://www.php.net/license/3_0.txt. If you did not receive a copy of * the PHP License and are unable to obtain it through the web, please * send a note to license@php.net so we can mail you a copy immediately. * * @category XML * @package XML_NITF * @author Patrick O'Lone <polone@townnews.com> * @copyright 1997-2005 The PHP Group * @license http://www.php.net/license/3_0.txt PHP License 3.0 * @version CVS: NITF.php,v 1.13 2005/12/09 14:51:04 polone Exp * @link http://pear.php.net/package/XML_NITF/ */
/** * Include the XML_Parser class as the base class */ require_once ('XML/Parser.php');
// {{{ XML_NITF
/**
 * Simple NITF Parser
 *
 * This class provides basic NITF parsing. Many of the major elements of the
 * NITF standard are supported. This implementation is based off the NITF 3.1
 * DTD, publicly available at the following URL:
 *
 * http://www.nitf.org/site/nitf-documentation/nitf-3-1.dtd
 *
 * Note that not all elements of this standard are supported.
 *
 * The element names matched by the handlers are upper case ('HL1', 'P',
 * 'BODY.CONTENT', ...), so the parser relies on the base class delivering
 * case-folded tag and attribute names.
 *
 * <sample>
 * <?php
 *
 * require_once("XML/NITF.php");
 *
 * $oNITF = new XML_NITF();
 * $oNITF->setInputFile("nitf.xml");
 * $xResult = $oNITF->parse();
 * if (PEAR::isError($xResult)) {
 *     die("Parsing failed: ".$xResult->getMessage());
 * }
 *
 * echo $oNITF->getHeadline();
 * echo $oNITF->getByline();
 *
 * ?>
 * </sample>
 *
 * @category  XML
 * @package   XML_NITF
 * @author    Patrick O'Lone <polone@townnews.com>
 * @copyright 1997-2005 The PHP Group
 * @license   http://www.php.net/license/3_0.txt PHP License 3.0
 * @version   Release: 1.0.2
 * @link      http://pear.php.net/package/XML_NITF
 */
class XML_NITF extends XML_Parser
{
    // {{{ properties

    /**
     * Meta tag properties retrieved from the document head section, keyed by
     * the lower-cased "name" attribute of each <meta> element.
     *
     * @see getMetaData()
     * @var array
     * @access private
     */
    var $m_kMeta = array();

    /**
     * Document Metadata
     *
     * Container for metadata information about this particular document.
     *
     * @see getDocData()
     * @var array
     * @access private
     */
    var $m_kDocData = array('key-list' => array());

    /**
     * Specific Publication Data
     *
     * Information about specific instance of an item's publication. Contains
     * metadata about how the particular news object was used in a specific
     * instance.
     *
     * @see getPubData()
     * @var array
     * @access private
     */
    var $m_kPubData = array();

    /**
     * Document Revisions
     *
     * Information about the creative history of the document; also used as an
     * audit trail. Includes who made changes, when the changes were made, and
     * why. Each element of the array is a key-based array that corresponds to
     * the <revision-history> element.
     *
     * @var array
     * @see getRevision()
     * @access private
     */
    var $m_akRevisions = array();

    /**
     * Document Headlines
     *
     * The various headlines that were found in the document. 'HL1' holds the
     * main headline as a string, 'HL2' holds a list of sub-headlines. Text
     * found in <hedline> outside of <hl2> is treated as part of HL1.
     *
     * @var array
     * @see getHeadline()
     * @access private
     */
    var $m_kHedlines = array('HL1' => null, 'HL2' => array());

    /**
     * Abstract
     *
     * Story abstract summary or synopsis of the contents of the document.
     * NOTE: declared for completeness; the handlers in this class do not
     * populate it.
     *
     * @var string
     * @access private
     */
    var $m_sAbstract = null;

    /**
     * Significant place mentioned in an article. Used to normalize locations.
     * The location in this variable is the place where the story's events will
     * or have unfolded.
     *
     * @var string
     * @access private
     */
    var $m_sLocation = null;

    /**
     * Information distributor. May or may not be the owner or creator.
     *
     * @var string
     * @access private
     */
    var $m_sDistributor = null;

    /**
     * The elements of the byline: the author's name ('author') and the
     * author's title ('title').
     *
     * @var array
     * @see getByline()
     * @access private
     */
    var $m_kByline = array('author' => null, 'title' => null);

    /**
     * An array of paragraphs extracted from the document
     *
     * @var array
     * @see getLede(), getContent()
     * @access private
     */
    var $m_aContent = array();

    /**
     * A list of media reference elements as found in the body section of the
     * document. Each element is an array itself with keyed properties related
     * to media element in question.
     *
     * @var array
     * @see getMedia()
     * @access private
     */
    var $m_aMedia = array();

    /**
     * The stack of elements that are currently open, outermost first. The
     * handlers consult it to decide which section of the document a piece of
     * character data belongs to.
     *
     * @var array
     * @see StartHandler(), EndHandler(), cdataHandler()
     * @access private
     */
    var $m_aParentTags = array();

    /**
     * A byline at the end of a story. Example: Stuart Myles contributed to this
     * article.
     *
     * @var string
     * @see getTagline()
     * @access private
     */
    var $m_sTagline = null;

    /**
     * Free-form bibliographic data. Used to elaborate on the source of
     * information.
     *
     * @var string
     * @see getBibliography()
     * @access private
     */
    var $m_sBibliography = null;

    /**
     * Scratch buffer for the text of the <hl2> element currently being parsed.
     *
     * @var string
     * @access private
     */
    var $_sHedline = null;

    /**
     * Scratch buffer for the text of the <p> element currently being parsed.
     *
     * @var string
     * @access private
     */
    var $_sContent = null;

    /**
     * True while the <p> element being parsed carries lede="true".
     *
     * @var boolean
     * @access private
     */
    var $_bIsLede = false;

    /**
     * Scratch record for the <media> element currently being parsed.
     *
     * @var array
     * @access private
     */
    var $_kMedia = null;

    // }}}
    // {{{ getDocData()

    /**
     * Access all or specific elements of the <docdata> block
     *
     * @param string $sProperty The property of the <docdata> block to return
     *                          (matched case-insensitively). The keys filled
     *                          in by this parser are:
     *                          +"doc-id"       - id-string of <doc-id>
     *                          +"key-list"     - keywords provided with the
     *                                            document (array)
     *                          +"copyright"    - the copyright holder
     *                          +"urgency"      - ed-urg value of <urgency>
     *                          +"date.issue"   - norm value of <date.issue>
     *                          +"date.release" - norm value of <date.release>
     *                          +"date.expire"  - norm value of <date.expire>
     *                          Dates are stored exactly as they appear in the
     *                          document; use strtotime() to convert them.
     *
     * @return mixed All of the elements from the <docdata> block will be returned
     *               if a specific property is not provided. If a specific property
     *               is requested and is found in the docdata block, then that
     *               property will be returned. If the property cannot be found,
     *               null is returned.
     *
     * @access public
     */
    function getDocData($sProperty = null)
    {
        if (empty($sProperty)) {
            return $this->m_kDocData;
        }

        $sProperty = strtolower($sProperty);
        if (isset($this->m_kDocData[$sProperty])) {
            return $this->m_kDocData[$sProperty];
        }

        return null;
    }

    // }}}
    // {{{ getMetaData()

    /**
     * Retrieve meta data from the NITF file
     *
     * @return array Returns an array of key/value pairs from the meta section
     * @access public
     */
    function getMetaData()
    {
        return $this->m_kMeta;
    }

    // }}}
    // {{{ getPubData()

    /**
     * Returns all elements or a specific element from the <pubdata> block
     *
     * @param string $sProperty The publication property being retrieved
     *                          (matched case-insensitively)
     * @return mixed Returns the whole attribute array when no property is
     *               given, the attribute value when the property exists, or
     *               null when it does not.
     *
     * @access public
     */
    function getPubData($sProperty = null)
    {
        if (empty($sProperty)) {
            return $this->m_kPubData;
        }

        $sProperty = strtolower($sProperty);
        if (isset($this->m_kPubData[$sProperty])) {
            return $this->m_kPubData[$sProperty];
        }

        return null;
    }

    // }}}
    // {{{ getRevision()

    /**
     * Get the revision history
     *
     * @return array An array containing key-value arrays, one per
     *               <revision-history> element, holding that element's
     *               attributes with lower-cased names. The attributes defined
     *               by the NITF DTD are:
     *
     *               +"comment"  - Reason for the revision
     *               +"function" - Job function of individual performing revision
     *               +"name"     - Name of the person who made the revision
     *               +"norm"     - Date of the revision
     * @access public
     */
    function getRevision()
    {
        return $this->m_akRevisions;
    }

    // }}}
    // {{{ getHeadline()

    /**
     * Retrieve a headline by level
     *
     * @param integer $nLevel The headline level: 1 for the main headline, 2
     *                        for the sub-headlines
     * @return mixed Returns the main headline as a string for level 1, the list
     *               of sub-headlines as an array for level 2, or null if the
     *               requested level does not exist or was never set
     * @access public
     */
    function getHeadline($nLevel = 1)
    {
        $sKey = "HL$nLevel";

        // Guard the lookup so that an unknown level yields null instead of an
        // undefined index notice.
        if (isset($this->m_kHedlines[$sKey])) {
            return $this->m_kHedlines[$sKey];
        }

        return null;
    }

    // }}}
    // {{{ getByline()

    /**
     * Return information about the author of a document
     *
     * @param string $sProperty The field of the byline to retrieve, either
     *                          "author" or "title" (matched case-insensitively)
     * @return string The requested byline field, or null if it is not set
     * @access public
     */
    function getByline($sProperty = 'author')
    {
        $sProperty = strtolower($sProperty);
        if (isset($this->m_kByline[$sProperty])) {
            return $this->m_kByline[$sProperty];
        }

        return null;
    }

    // }}}
    // {{{ getMedia()

    /**
     * Query for a list of related media elements
     *
     * @param string $sProperty If supplied, only this property will be returned
     *                          for each element of the media reference array.
     *                          Elements in which the property is unset or null
     *                          are left out of the result.
     * @return array Returns an array of all media reference data, or an array of
     *               select media reference data determined by the property
     *               parameter passed.
     * @access public
     */
    function getMedia($sProperty = null)
    {
        if (empty($sProperty)) {
            return $this->m_aMedia;
        }

        $aMediaRefs = array();
        foreach ($this->m_aMedia as $aMediaElem) {
            if (isset($aMediaElem[$sProperty])) {
                array_push($aMediaRefs, $aMediaElem[$sProperty]);
            }
        }

        return $aMediaRefs;
    }

    // }}}
    // {{{ getLede()

    /**
     * Returns the lede (sometimes called lead) paragraph
     *
     * @return string Returns the lede paragraph if it is defined, or null otherwise
     * @access public
     */
    function getLede()
    {
        if (isset($this->m_aContent[0])) {
            return $this->m_aContent[0];
        }

        return null;
    }

    // }}}
    // {{{ getContent()

    /**
     * Returns the paragraphs of content
     *
     * @return array An array of elements that represent a single paragraph each
     * @access public
     */
    function & getContent()
    {
        return $this->m_aContent;
    }

    // }}}
    // {{{ getTagLine()

    /**
     * Returns the tag line (if one exists)
     *
     * @return string The tag line extracted from the NITF data source
     * @access public
     */
    function getTagline()
    {
        return $this->m_sTagline;
    }

    // }}}
    // {{{ getBibliography()

    /**
     * Returns the free-form bibliographic data
     *
     * @return string The bibliography (if one exists) is returned
     * @access public
     */
    function getBibliography()
    {
        return $this->m_sBibliography;
    }

    // }}}
    // {{{ toString()

    /**
     * Get a string version of the article
     *
     * @param string $sCRLF The character(s) used to separate each article
     *                      element in the string that is returned - often
     *                      referred to as the CRLF.
     * @return string A string representing the main headline, author,
     *                location, content, and tagline.
     * @access public
     */
    function & toString($sCRLF = "\n")
    {
        $sArticle = "{$this->m_kHedlines['HL1']}$sCRLF";

        if (!empty($this->m_kByline['author'])) {
            $sArticle .= "{$this->m_kByline['author']}$sCRLF";
        }

        // The location is rendered as a dateline prefix of the first paragraph.
        if (!empty($this->m_sLocation)) {
            $sArticle .= "{$this->m_sLocation} - ";
        }

        $sArticle .= join($sCRLF, $this->m_aContent);

        if (!empty($this->m_sTagline)) {
            $sArticle .= "$sCRLF{$this->m_sTagline}";
        }

        return $sArticle;
    }

    // }}}
    // {{{ StartHandler()

    /**
     * Handle start XML elements and attributes
     *
     * @param object $oParser The XML parser object instance that was inherited
     *                        from the XML_Parser class
     * @param string $sName   A tag element from the XML data stream
     * @param array  $kAttrib An array of XML attributes associated with the given
     *                        tag supplied
     * @return void
     * @access private
     */
    function StartHandler($oParser, $sName, $kAttrib)
    {
        // Push the element into the stack of XML elements already visited
        array_push($this->m_aParentTags, $sName);

        // Handle the attributes of the XML tags. Optional attributes are read
        // through _getAttribute() so that a missing one becomes null rather
        // than raising an undefined index notice.
        switch ($sName) {

            case 'HL2':
                $this->_sHedline = null;
                break;

            case 'P':
                $this->_bIsLede  = (!empty($kAttrib['LEDE']) && ($kAttrib['LEDE'] == 'true'));
                $this->_sContent = null;
                break;

            case 'DOC.COPYRIGHT':
                $this->m_kDocData['copyright'] = $this->_getAttribute($kAttrib, 'HOLDER');
                break;

            case 'MEDIA':
                $this->_kMedia = array(
                    'type'      => empty($kAttrib['MEDIA-TYPE']) ? 'other' : $kAttrib['MEDIA-TYPE'],
                    'source'    => null,
                    'mime-type' => null,
                    'caption'   => null,
                    'data'      => null,
                    'encoding'  => null,
                    'producer'  => null,
                    'meta'      => array()
                );
                break;

            case 'MEDIA-REFERENCE':
                if (!empty($kAttrib['SOURCE'])) {
                    $this->_kMedia['source'] = $kAttrib['SOURCE'];

                // Compatibility with the AP Usenet feed - note that this is a
                // non standard attribute and is NOT a part of NITF standards
                } elseif (!empty($kAttrib['DATA-LOCATION'])) {
                    $this->_kMedia['source'] = $kAttrib['DATA-LOCATION'];
                }

                $this->_kMedia['mime-type'] = $this->_getAttribute($kAttrib, 'MIME-TYPE');
                break;

            case 'MEDIA-OBJECT':
                $this->_kMedia['encoding'] = $this->_getAttribute($kAttrib, 'ENCODING');
                break;

            case 'MEDIA-METADATA':
                if (!empty($kAttrib['NAME'])) {
                    $sValue = $this->_getAttribute($kAttrib, 'VALUE');

                    // Collect the pair in the dedicated 'meta' list ...
                    $this->_kMedia['meta'][$kAttrib['NAME']] = $sValue;

                    // ... and keep mirroring it at the top level of the media
                    // record, where earlier versions stored it, so existing
                    // getMedia($sProperty) lookups keep working.
                    $this->_kMedia[$kAttrib['NAME']] = $sValue;
                }
                break;

            case 'PUBDATA':
                foreach ($kAttrib as $sKey => $sValue) {
                    $this->m_kPubData[strtolower($sKey)] = $sValue;
                }
                break;

            case 'DOC-ID':
                $this->m_kDocData['doc-id'] = $this->_getAttribute($kAttrib, 'ID-STRING');
                break;

            // NITF 3.0 extension - added per request by Lars Schenk
            // (info@lars-schenk.de). Document urgency status information.
            case 'URGENCY':
                $this->m_kDocData['urgency'] = $this->_getAttribute($kAttrib, 'ED-URG');
                break;

            // The list of keywords or phrases are just added to the array of
            // keywords. A <keyword> without a key attribute carries no
            // information and is skipped.
            case 'KEYWORD':
                if (empty($this->m_kDocData['key-list'])) {
                    $this->m_kDocData['key-list'] = array();
                }

                if (isset($kAttrib['KEY'])) {
                    array_push($this->m_kDocData['key-list'], $kAttrib['KEY']);
                }
                break;

            // The release, expiration, and issuing dates of this article. The
            // ISO-8601 time stamp settings are preserved, but you can use the
            // magic function strtotime() to convert these to time stamp values.
            case 'DATE.RELEASE':
            case 'DATE.EXPIRE':
            case 'DATE.ISSUE':
                if (!empty($kAttrib['NORM'])) {
                    $this->m_kDocData[strtolower($sName)] = $kAttrib['NORM'];
                }
                break;

            case 'REVISION-HISTORY':
                array_push($this->m_akRevisions, array_change_key_case($kAttrib, CASE_LOWER));
                break;

            case 'META':
                if (!empty($kAttrib['NAME']) && isset($kAttrib['CONTENT'])) {
                    $this->m_kMeta[strtolower($kAttrib['NAME'])] = $kAttrib['CONTENT'];
                }
                break;
        }
    }

    // }}}
    // {{{ EndHandler()

    /**
     * Handle XML tag closing state
     *
     * @param object $oParser The parser object parsing the XML data
     * @param string $sName   The name of the tag element that has just ended
     * @return void
     * @access private
     */
    function EndHandler($oParser, $sName)
    {
        switch ($sName) {

            case 'HL1':
                $this->m_kHedlines['HL1'] = trim((string) $this->m_kHedlines['HL1']);
                break;

            case 'HL2':
                array_push($this->m_kHedlines['HL2'], trim((string) $this->_sHedline));
                $this->_sHedline = null;
                break;

            case 'P':
                // Only paragraphs whose text cdataHandler() actually collects
                // are article content. Paragraphs elsewhere (for example in an
                // <abstract> or a media caption) would otherwise be recorded
                // as empty strings and could displace the real lede.
                if ($this->_isBodyParagraph()) {
                    $sParagraph = trim((string) $this->_sContent);
                    if ($this->_bIsLede) {
                        // The lede always goes first, wherever it appeared.
                        array_unshift($this->m_aContent, $sParagraph);
                    } else {
                        array_push($this->m_aContent, $sParagraph);
                    }
                }

                $this->_bIsLede  = false;
                $this->_sContent = null;
                break;

            case 'MEDIA':
                array_push($this->m_aMedia, $this->_kMedia);
                $this->_kMedia = null;
                break;
        }

        // The element is closed; drop it from the stack of open elements.
        array_pop($this->m_aParentTags);
    }

    // }}}
    // {{{ cdataHandler()

    /**
     * Parses CDATA chunks
     *
     * Routes each chunk of character data to the property that matches the
     * elements currently open. Data may arrive in several chunks per element,
     * so every target is appended to rather than assigned.
     *
     * @param object $oParser The XML parser instance inherited from the
     *                        XML_Parser class
     * @param string $sData   The data chunk to be processed from the parser
     * @return void
     * @access private
     */
    function cdataHandler($oParser, $sData)
    {
        // Inline media payloads are kept verbatim; everywhere else runs of
        // whitespace are collapsed into a single space.
        if (!$this->_isInside('MEDIA-OBJECT')) {
            $sData = preg_replace('#\s+#', ' ', $sData);
        }

        // Elements that can be found in the BODY.HEAD section of the NITF
        // document are defined in this handler.
        if ($this->_isInside('BODY.HEAD')) {

            // We don't care if they use other attribute items, we just want
            // the textual version of the byline. Everything in the byline that
            // is not the title is treated as the author.
            if ($this->_isInside('BYLINE')) {
                if ($this->_isInside('BYTTL')) {
                    $this->m_kByline['title'] .= $sData;
                } else {
                    $this->m_kByline['author'] .= $sData;
                }

            // Generally, the distributor is the same as the company supplying
            // the content. However, this is not always the case (the AP, for
            // example).
            } elseif ($this->_isInside('DISTRIBUTOR')) {
                $this->m_sDistributor .= $sData;

            // The location the story pertains to. Other dateline text is
            // ignored.
            } elseif ($this->_isInside('DATELINE')) {
                if ($this->_isInside('LOCATION')) {
                    $this->m_sLocation .= $sData;
                }

            // There are only two possibilities for hedlines, the main headline
            // or a subheadline.
            } elseif ($this->_isInside('HEDLINE')) {
                if ($this->_isInside('HL2')) {
                    $this->_sHedline .= $sData;
                } else {
                    $this->m_kHedlines['HL1'] .= $sData;
                }
            }

            return;
        }

        // The article content, including the lead and following paragraphs, can
        // be found in this section of the XML document.
        if ($this->_isInside('BODY.CONTENT')) {

            if ($this->_isInside('MEDIA')) {

                // The media caption for the currently selected media element.
                if ($this->_isInside('MEDIA-CAPTION')) {
                    $this->_kMedia['caption'] .= $sData;
                    return;
                }

                // Inline data of the currently selected media element.
                if ($this->_isInside('MEDIA-OBJECT')) {
                    $this->_kMedia['data'] .= $sData;
                    return;
                }
            }

            // A paragraph element was found.
            if ($this->_isInside('P')) {
                $this->_sContent .= $sData;
                return;
            }
        }

        // The <body.end> tag has two primary elements, <tagline> and the free
        // form <bibliography> tags.
        if ($this->_isInside('BODY.END')) {
            if ($this->_isInside('TAGLINE')) {
                $this->m_sTagline .= $sData;
                return;
            }

            if ($this->_isInside('BIBLIOGRAPHY')) {
                $this->m_sBibliography .= $sData;
            }
        }
    }

    // }}}
    // {{{ _isInside()

    /**
     * Tell whether an element with the given name is currently open
     *
     * @param string $sTag The (upper case) element name to look for
     * @return boolean True if the element is on the stack of open elements
     * @access private
     */
    function _isInside($sTag)
    {
        return in_array($sTag, $this->m_aParentTags);
    }

    // }}}
    // {{{ _isBodyParagraph()

    /**
     * Tell whether the open <p> element is one whose text is collected as
     * article content
     *
     * Mirrors the routing in cdataHandler(): paragraph text is only collected
     * inside <body.content>, and not inside a media caption or media object,
     * whose text is stored on the media record instead.
     *
     * @return boolean True if the current paragraph belongs to the content
     * @access private
     */
    function _isBodyParagraph()
    {
        if (!$this->_isInside('BODY.CONTENT') || !$this->_isInside('P')) {
            return false;
        }

        if ($this->_isInside('MEDIA')
            && ($this->_isInside('MEDIA-CAPTION') || $this->_isInside('MEDIA-OBJECT'))) {
            return false;
        }

        return true;
    }

    // }}}
    // {{{ _getAttribute()

    /**
     * Read an attribute that may be absent from an element
     *
     * @param array  $kAttrib  The attribute array passed to StartHandler()
     * @param string $sKey     The (upper case) attribute name
     * @param mixed  $xDefault The value to return when the attribute is absent
     * @return mixed The attribute value, or $xDefault if it is not present
     * @access private
     */
    function _getAttribute($kAttrib, $sKey, $xDefault = null)
    {
        return isset($kAttrib[$sKey]) ? $kAttrib[$sKey] : $xDefault;
    }

    // }}}
}
// }}}
/* * Local variables: * tab-width: 4 * c-basic-offset: 4 * c-hanging-comment-ender-p: nil * End: */ ?>
|