server-php-xml-class-rss-parser-class-rdf-parser.php / php
<?php //#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
###
// Title : Class Rdf_parser // Version : 1.0 // Author : Jason Diammond -repat RDF parser- // : Luis Argerich -PHP version of repat- (lrargerich@yahoo.com) // Last modification date : 06-13-2002 // Description : A port to PHP of the Repat an RDF parser. // This parser based on expat parses RDF files producing events // proper of RDF documents. //#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
###
// History: // 06-13-2002 : First version of this class. // 07-17-2002 Minor bugfix (Leandro Mariano Lopez) //#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
###
// To-Dos: // //#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
###
// How to use it: // Read the documentation in rdf_parser.html //#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
###
if(defined("_class_rdf_is_included")) { // do nothing since the class is already included } else { define("_class_rdf_is_included",1); /* First we define some constants */ define("XML_NAMESPACE_URI","http://www.w3.org/XML/1998/namespace" ); define("XML_LANG","lang"); define("RDF_NAMESPACE_URI","http://www.w3.org/1999/02/22-rdf-syntax-ns#" ); define("RDF_RDF","RDF"); define("RDF_DESCRIPTION","Description"); define("RDF_ID","ID"); define("RDF_ABOUT","about"); define("RDF_ABOUT_EACH","aboutEach"); define("RDF_ABOUT_EACH_PREFIX","aboutEachPrefix"); define("RDF_BAG_ID","bagID"); define("RDF_RESOURCE","resource"); define("RDF_VALUE","value"); define("RDF_PARSE_TYPE","parseType"); define("RDF_PARSE_TYPE_LITERAL","Literal"); define("RDF_PARSE_TYPE_RESOURCE","Resource"); define("RDF_TYPE","type"); define("RDF_BAG","Bag"); define("RDF_SEQ","Seq"); define("RDF_ALT","Alt"); define("RDF_LI","li"); define("RDF_STATEMENT","Statement"); define("RDF_SUBJECT","subject"); define("RDF_PREDICATE","predicate"); define("RDF_OBJECT","object"); define("NAMESPACE_SEPARATOR_CHAR",'^'); define("NAMESPACE_SEPARATOR_STRING","^"); //define("FALSE",0); //define("TRUE",1); define("IN_TOP_LEVEL",0); define("IN_RDF",1); define("IN_DESCRIPTION",2); define("IN_PROPERTY_UNKNOWN_OBJECT",3); define("IN_PROPERTY_RESOURCE",4); define("IN_PROPERTY_EMPTY_RESOURCE",5); define("IN_PROPERTY_LITERAL",6); define("IN_PROPERTY_PARSE_TYPE_LITERAL",7); define("IN_PROPERTY_PARSE_TYPE_RESOURCE",8); define("IN_XML",9); define("IN_UNKNOWN",10); //define("IN_PROPERTY_PARSE_TYPE_LITERAL",9); define("RDF_SUBJECT_TYPE_URI",0); define("RDF_SUBJECT_TYPE_DISTRIBUTED",1); define("RDF_SUBJECT_TYPE_PREFIX",2); define("RDF_SUBJECT_TYPE_ANONYMOUS",3); define("RDF_OBJECT_TYPE_RESOURCE",0); define("RDF_OBJECT_TYPE_LITERAL",1); define("RDF_OBJECT_TYPE_XML",2); class Rdf_parser { var /* Private Methods */ function _new_element() { e["parent"]=Array(); // Parent is a blank Array //e["parent"]); e["has_property_atributes"]=0; e["subject_type"]=0; e["predicate"]=''; e["members"]=0; e["xml_lang"]=''; e["statements"]=0; e; } function _copy_element(destination ) { if( destination["parent"] = destination["state"] = destination["xml_lang"] = e) { e["predicate"]=''; e["bag_id"]=''; e["parent"])) { if( e["parent"]["xml_lang"] != e["xml_lang"]=''; } } else { e["xml_lang"]=''; } //memset( e, 0, strlen( _rdf_element ) ); e["state"]=0; e["has_member_attributes"]=0; e["subject"]=''; e["ordinal"]=0; e["data"]=''; e["bag_id"]=''; e["statement_id"]=''; } function _push_element() { if(!isset(this->rdf_parser["free"]=Array(); } if(count(e = e["parent"])) { e["parent"]; } else { this->_new_element(); } } else { this->_new_element(); } if(!isset(this->rdf_parser["top"]=Array(); } this->rdf_parser["top"], this->rdf_parser["top"] = e = this->rdf_parser["top"] = this->_clear_element( /* if(isset(this->rdf_parser["free"])) { this->rdf_parser["free"]; } else { this->rdf_parser["free"] = local_name ) { return ( local_name ) { return ( local_name ) { //if(local_name{0}=='_') if( ordinal = substr(ordinal > 0 ) ? local_name ) { return local_name ) || local_name ); } function _is_rdf_property_element( local_name == RDF_TYPE ) || ( local_name == RDF_PREDICATE ) || ( local_name == RDF_VALUE ) || ( local_name{0} == '_' ); } function _istalnum(val); } function _istalpha(val); } function _is_absolute_uri(result = false; uri && uri{uri_p; while( (<strlen(uri)) && ( uri{uri{uri{uri{uri_p; } uri{result; } /* This function returns an associative array returning any of the various components of the URL that are present. This includes the url) scheme - e.g. http host port user pass path query - after the question mark ? fragment - after the hashmark # */ function _parse_uri(buffer,scheme,&path,&fragment ) { uri); if(isset(scheme=scheme=''; } if(isset(host=host=''; } if(isset(authority=authority=''; } if(isset(path=path=''; } if(isset(query=query=''; } if(isset(fragment=fragment=''; } } function _resolve_uri_reference(reference_uri,&length ) { reference_buffer=''; buffer = ''; reference_uri,reference_buffer ),reference_authority, reference_query,reference_scheme == '' && reference_path == '' && buffer=reference_fragment != '' ) { buffer.=reference_scheme != '' ) { reference_uri; } else { base_uri, base_buffer ), base_authority, base_query, result_scheme = reference_authority != '' ) { reference_authority; } else { base_authority; if( reference_path{0} == '/') || (result_path = p = ''; path_buffer; p = strstr( p ) { base_path, '\\' ); } if( path_buffer.=//while( s <= p ) //{ // *d++ = *s++; //} //*d++ = 0; } if( reference_path != '' ) { reference_path; } { //remove all occurrences of "./" //print(path_buffer=preg_replace("/\/\.\//","/",path_buffer=preg_replace("/\/([^\/\.])*\/..path_buffer); while(preg_match("/\.\./",path_buffer=preg_replace("/\/([^\/\.]*)\/..\//","/",path_buffer=preg_replace("/\.path_buffer); } } } // This replaces the C pointer assignament path_buffer; if( buffer=buffer.=":"; } if( buffer.="//"; result_authority; } if( buffer.=reference_query != '' ) { buffer.=reference_fragment != '' ) { buffer.=id ) { p = p_p=0; if( this->_istalpha( p{0} == '_' || result = true; while( p{++this->_istalnum( p_p} ) || p_p} == '.' || p_p} == '-' || p_p} == '_' || p_p} == ':' ) ) { result; } function _resolve_id(buffer,id_buffer=''; if( id ) == true ) { id"; } else { id_buffer."#_bad_ID_attribute_"); } this->rdf_parser["base_uri"], buffer, name, &len,&local_name ) { static buffer=buffer, NAMESPACE_SEPARATOR_CHAR ) ) { buffer); cosas[0]; cosas[1]; } else { if( ( buffer{ 1 } == 'm' ) && ( buffer{ 3 } == ':' ) ) { local_name = substr(namespace_uri = ''; buffer; } } } function _generate_anonymous_uri(&len ) { this->rdf_parser["anonymous_id"])) { this->rdf_parser["anonymous_id"]++; this->rdf_parser["anonymous_id"]; this->rdf_parser["base_uri"], buf, subject_type, predicate, object_type, xml_lang, statements, statement_id_type = RDF_SUBJECT_TYPE_URI; predicate_buffer=''; if( this->rdf_parser["statement_handler"](subject_type,predicate,object_type,xml_lang ); if( statements == '' ) { bag_id, RDF_NAMESPACE_URI.RDF_TYPE, 0, RDF_OBJECT_TYPE_RESOURCE, RDF_NAMESPACE_URI.RDF_BAG, '', '', '', '' ); } if( ! statement_id_type = RDF_SUBJECT_TYPE_ANONYMOUS; statement_id_buffer, strlen( statement_id = statements++; statements; bag_id, statements, RDF_OBJECT_TYPE_RESOURCE, statement_id ) { // rdf:type = rdf:Statement statement_id_type, this->_report_statement( statement_id, RDF_NAMESPACE_URI.RDF_SUBJECT, 0, RDF_OBJECT_TYPE_RESOURCE, this->_report_statement( statement_id, RDF_NAMESPACE_URI.RDF_PREDICATE, 0, RDF_OBJECT_TYPE_RESOURCE, this->_report_statement( statement_id, RDF_NAMESPACE_URI.RDF_OBJECT, 0, object, '', '', '', '' ); } } } function _report_start_parse_type_literal() { if( this->rdf_parser["start_parse_type_literal_handler"]( this->rdf_parser["end_parse_type_literal_handler"] ) { this->rdf_parser["user_data"] ); } } function _handle_property_attributes(subject, xml_lang, statements ) { attribute=''; attribute_namespace_uri=''; attribute_value=''; i = 0; isset(i ]); this->_split_name( i ], attribute ), attribute_local_name ); attributes[ predicate=predicate.=attribute_namespace_uri ) { if( attribute_local_name ) ) { subject_type, predicate, 0, RDF_OBJECT_TYPE_LITERAL, xml_lang, statements, '' ); } else if( attribute_local_name ) ) { subject_type, predicate, 0, RDF_OBJECT_TYPE_RESOURCE, bag_id, ordinal = attribute_local_name ) ) != 0 ) { subject_type, predicate, attribute_value, bag_id, attribute_namespace_uri ) { //do nothing } else if( this->_report_statement( subject, attribute_value, bag_id, name, this->rdf_parser["start_element_handler"]) ) { this->rdf_parser["user_data"], attributes ); } } function _report_end_element( this->rdf_parser["end_element_handler"]) ) { this->rdf_parser["user_data"], s,this->rdf_parser["character_data_handler"]) ) { this->rdf_parser["user_data"], len ); } } function _report_warning( this->rdf_parser["warning_handler"]) ) { warning); } } function _handle_resource_element( local_name, parent ) { aux=aux2=Array(); foreach(atkey=>aux2[]=aux2[]=attributes=id = ''; about_each = ''; bag_id = ''; attribute=''; attribute_local_name=''; id_buffer=''; this->rdf_parser["top"]["has_property_attributes"] = false; i = 0; isset(i]); this->_split_name( i ], attribute ), attribute_local_name ); attributes[ attribute_namespace_uri == '' ) || ( attribute_local_name == RDF_ID ) { attribute_value; ++attribute_local_name == RDF_ABOUT ) { attribute_value; ++attribute_local_name == RDF_ABOUT_EACH ) { attribute_value; ++attribute_local_name == RDF_ABOUT_EACH_PREFIX ) { attribute_value; ++attribute_local_name == RDF_BAG_ID) { attribute_value; } else if( attribute_local_name ) ) { this->_is_rdf_ordinal( this->rdf_parser["top"]["has_property_attributes"] = true; this->_report_warning( "unknown or out of context rdf attribute:".attribute_namespace_uri == XML_NAMESPACE_URI ) { if( this->rdf_parser["top"]["xml_lang"] = attribute_namespace_uri ) { subjects_found == 0 ) { id_buffer, strlen( this->rdf_parser["top"]["subject"]=this->rdf_parser["top"]["subject_type"] = RDF_SUBJECT_TYPE_ANONYMOUS; } else if( this->_report_warning( "ID, about, aboutEach, and aboutEachPrefix are mutually exclusive" ); return; } else if( this->_resolve_id( id_buffer, strlen( this->rdf_parser["top"]["subject_type"] = RDF_SUBJECT_TYPE_URI; id_buffer; } else if( this->_resolve_uri_reference( about, id_buffer ) ); this->rdf_parser["top"]["subject"]=about_each ) { this->rdf_parser["top"]["subject"]=about_each_prefix ) { this->rdf_parser["top"]["subject"]=this->rdf_parser["top"]["subject"] == '' ) { this->rdf_parser["top"]["subject"]=len = strlen( len > 0 ) { //bag_id ) { bag_id, id_buffer ) ); id_buffer; } // only report the type for non-rdf:Description elements. if( (namespace_uri != RDF_NAMESPACE_URI ) ) { namespace_uri; local_name; this->rdf_parser["top"]["subject_type"], type, '', this->rdf_parser["top"]["statements"], '' ); } // if this element is the child of some property, // report the appropriate statement. if( this->_report_statement( parent["parent"]["subject"], parent["ordinal"], RDF_OBJECT_TYPE_RESOURCE, parent["parent"]["bag_id"], parent["statement_id"] ); } if( this->_handle_property_attributes( this->rdf_parser["top"]["subject"], this->rdf_parser["top"]["xml_lang"], this->rdf_parser["top"]["statements"] ); } } function _handle_property_element( &local_name, &buffer=''; aux=aux2=Array(); foreach(atkey=>aux2[]=aux2[]=attributes=attribute_namespace_uri=''; attribute_value = ''; statement_id = ''; parse_type = ''; namespace_uri == RDF_NAMESPACE_URI ) { if( (this->_is_rdf_ordinal( this->rdf_parser["top"]["ordinal"] > this->rdf_parser["top"]["parent"]["members"] = this->_is_rdf_property_element( this->_report_warning( "unknown or out of context rdf property element: ".buffer=namespace_uri == RDF_NAMESPACE_URI ) && ( ordinal=''; this->rdf_parser["top"]["ordinal"] = this->rdf_parser["top"]["ordinal"]=ordinal{ 0 } = '_' ; this->rdf_parser["top"]["ordinal"]; } else { local_name; } buffer; this->rdf_parser["top"]["has_member_attributes"] = false; for( attributes[i += 2 ) { attributes[buffer, strlen( attribute_namespace_uri, attribute_value = i + 1]; // if the attribute is not in any namespace // or the attribute is in the RDF namespace if( ( attribute_namespace_uri == RDF_NAMESPACE_URI ) ) { if( ( statement_id = attribute_local_name == RDF_PARSE_TYPE ) { attribute_value; } else if( resource = attribute_local_name == RDF_BAG_ID ) { attribute_value; } else if( attribute_local_name ) ) { this->_report_warning( "unknown rdf attribute: ".attribute_namespace_uri == XML_NAMESPACE_URI ) { if( this->rdf_parser["top"]["xml_lang"] = attribute_namespace_uri ) { statement_id && this->_report_warning( "rdf:ID and rdf:resource are mutually exclusive" ); return; } if( this->_resolve_id(buffer, strlen( this->rdf_parser["top"]["statement_id"]=parse_type ) { if( this->_report_warning( "property elements with rdf:parseType do not allow rdf:resource" ); return; } if( this->_report_warning( "property elements with rdf:parseType do not allow rdf:bagID" ); return; } if( this->_report_warning( "property elements with rdf:parseType do not allow property attributes"); return; } if( this->_generate_anonymous_uri( buffer ) ); // since we are sure that this is now a resource property we can report it this->rdf_parser["top"]["parent"]["subject_type"], this->rdf_parser["top"]["predicate"], 0, RDF_OBJECT_TYPE_RESOURCE, this->rdf_parser["top"]["parent"]["bag_id"], statement_id ); this->rdf_parser["top"]["state"] = IN_PROPERTY_PARSE_TYPE_RESOURCE; this->rdf_parser["top"]["subject"]=this->rdf_parser["top"]["bag_id"]=''; } else { this->rdf_parser["top"]["parent"]["subject_type"], this->rdf_parser["top"]["predicate"], 0, RDF_OBJECT_TYPE_XML, '', '', this->rdf_parser["top"]["parent"]["statements"], this->rdf_parser["top"]["state"] = IN_PROPERTY_PARSE_TYPE_LITERAL; resource || this->rdf_parser["top"]["has_property_attributes"] ) { if( subject_type = RDF_SUBJECT_TYPE_URI; this->rdf_parser["base_uri"], buffer, strlen( subject_type = RDF_SUBJECT_TYPE_ANONYMOUS; buffer ) ); } this->_report_statement( this->rdf_parser["top"]["parent"]["subject"], this->rdf_parser["top"]["ordinal"], RDF_OBJECT_TYPE_RESOURCE, this->rdf_parser["top"]["parent"]["bag_id"], bag_id ) { bag_id, buffer ) ); buffer; } if( this->_handle_property_attributes( buffer, this->rdf_parser["top"]["xml_lang"], this->rdf_parser["top"]["statements"] ); } } } function _start_element_handler(name, buffer=''; local_name=''; /* if( rdf_parser->top != '' && rdf_parser->top->state != IN_TOP_LEVEL ) { ++rdf_parser->anonymous_id; } */ this->_split_name( buffer, strlen( namespace_uri, this->rdf_parser["top"]["state"] ) { case IN_TOP_LEVEL: if( RDF_NAMESPACE_URI.NAMESPACE_SEPARATOR_STRING.RDF_RDF == this->rdf_parser["top"]["state"] = IN_RDF; } else { name, this->rdf_parser["top"]["state"] = IN_DESCRIPTION; namespace_uri, attributes, '' ); break; case IN_DESCRIPTION: case IN_PROPERTY_PARSE_TYPE_RESOURCE: this->_handle_property_element( local_name, /* if we're in a property with an unknown object type and we encounter an element, the object must be a resource, */ this->rdf_parser["top"]["data"]=''; this->rdf_parser["top"]["state"] = IN_DESCRIPTION; namespace_uri, attributes, this->_report_warning( "no markup allowed in literals" ); break; case IN_PROPERTY_PARSE_TYPE_LITERAL: /* fall through */ case IN_XML: this->_report_start_element( attributes ); break; case IN_PROPERTY_RESOURCE: this->_report_warning( "no content allowed in property with rdf:resource, rdf:bagID, or property attributes" ); break; case IN_UNKNOWN: break; } } /* this is only called when we're in the IN_PROPERTY_UNKNOWN_OBJECT state. the only time we won't know what type of object a statement has is when we encounter property statements without property attributes or content: <foo:property /> <foo:property ></foo:property> <foo:property> </foo:property> notice that the state doesn't switch to IN_PROPERTY_LITERAL when there is only whitespace between the start and end tags. this isn't a very useful statement since the object is anonymous and can't have any statements with it as the subject but it is allowed. */ function _end_empty_resource_property() { this->_generate_anonymous_uri(buffer ) ); this->rdf_parser["top"]["parent"]["subject_type"], this->rdf_parser["top"]["predicate"], buffer, this->rdf_parser["top"]["parent"]["bag_id"], this->rdf_parser["top"]["statement_id"] ); } /* property elements with text only as content set the state to IN_PROPERTY_LITERAL. as character data is received from expat, it is saved in a buffer and reported when the end tag is received. */ function _end_literal_property() { if(!isset(this->rdf_parser["top"]["statement_id"]=''; } if(!isset(this->rdf_parser["top"]["parent"]["subject_type"]=''; } if(!isset(this->rdf_parser["top"]["parent"]["subject"]=''; } if(!isset(this->rdf_parser["top"]["parent"]["bag_id"]=''; } if(!isset(this->rdf_parser["top"]["parent"]["statements"]=0; } if(!isset(this->rdf_parser["top"]["predicate"]=''; } if(!isset(this->rdf_parser["top"]["ordinal"]=0; } this->rdf_parser["top"]["parent"]["subject_type"], this->rdf_parser["top"]["predicate"], this->rdf_parser["top"]["data"], this->rdf_parser["top"]["parent"]["bag_id"], this->rdf_parser["top"]["statement_id"] ); } function _end_element_handler( name ) { switch( /* fall through */ case IN_XML: this->_report_end_element( this->_end_empty_resource_property(); break; case IN_PROPERTY_LITERAL: this->_pop_element( ); break; case IN_PROPERTY_PARSE_TYPE_LITERAL: this->_pop_element(); } function _character_data_handler( s) { s); switch( this->rdf_parser["top"]["data"]) ) { this->rdf_parser["top"]["data"] ); s; } else { s; } if( /* look for non-whitespace */ for( i = 0; (( len ) && ( ereg(" |\n|\t",i }) )); i++; /* if we found non-whitespace, this is a literal */ if( len ) { this->_report_character_data( s) ); break; case IN_RDF: case IN_DESCRIPTION: case IN_PROPERTY_RESOURCE: case IN_PROPERTY_EMPTY_RESOURCE: case IN_PROPERTY_PARSE_TYPE_RESOURCE: case IN_UNKNOWN: break; } } /* public functions */ function rdf_parser_create( parser = xml_parser_create_ns( parser,XML_OPTION_CASE_FOLDING,0); parser; xml_set_object(this); xml_set_element_handler( this->rdf_parser["xml_parser"], "_character_data_handler" ); return z=3; // xml_parser_free( this->rdf_parser["base_uri"]=''; this->rdf_parser ); unset( user_data ) { user_data; } function rdf_get_user_data( ) { return ( user_data"] ); } function rdf_set_statement_handler(this->rdf_parser["statement_handler"] = start,this->rdf_parser["start_parse_type_literal_handler"] = this->rdf_parser["end_parse_type_literal_handler"] = start,this->rdf_parser["_start_element_handler"] = this->rdf_parser["_end_element_handler"] = handler) { handler; } function rdf_set_warning_handler(this->rdf_parser["warning_handler"] = s, is_final ) { return XML_Parse( s, this->rdf_parser["xml_parser"]); } function rdf_set_base(/* if( buffer[" tcslen( buffer ) - 1 "] != T( '#' ) ) { tcscat( buffer, T( "#" ) ); } */ /* check for out of memory */ this->rdf_parser["base_uri"]=this->rdf_parser["base_uri"]; } function rdf_resolve_uri(buffer) { _resolve_uri_reference( uri_reference, buffer) ); } } } ?>
(C) Æliens 20/2/2008
You may not copy or print any of this material without explicit permission of the author or the publisher. In case of other copyright issues, contact the author.