Per C++ non so se ci sono librerie già pronte che fanno la stessa cosa; io mi ero fatto un parser usando TidyLib, si usa così:
Codice PHP:
// Usage example: build a Parser, bail out if parsing failed, then pull
// individual pieces out of the document with slash-separated node paths.
// NOTE(review): Parser(const char*) forwards its argument to
// tidyParseString, so "fileName" presumably has to hold the HTML text itself
// rather than a path on disk -- confirm against the caller.
Parser parser(fileName);
if (!parser.isValid()) return false;
// getText returns the text contained in the node:
title = parser.getText("/html/body/div/table[2]/tr/td[2]/p/a/font/b/");
author = parser.getText("/html/body/div/table[2]/tr/td[2]/p/a[2]/font/b/");
description = parser.getText("/html/body/div/table[2]/tr/td[2]/p/");
// getValue returns the value of the requested attribute of the node
// (here the HREF attribute):
imageUrl = parser.getValue("/html/body/div/table[2]/tr/td[2]/a/", TidyAttr_HREF);
downloadUrl = parser.getValue("/html/body/div/table[2]/tr/td[2]/p/a/", TidyAttr_HREF);
Qua c'è il codice della classe Parser; al posto di wxString puoi usare std::string se non usi wxWidgets. Per usarlo basta che includi il file parser.h nel tuo progetto e lo linki a TidyLib.
Codice PHP:
#include <tidy.h>
#include <buffio.h>
#include <stdio.h>
#include <errno.h>
#include "wx/wx.h"
#include "wx/string.h"
/**
 * Small HTML scraping helper built on top of TidyLib.
 *
 * A document is parsed with open() (or the converting constructor) and then
 * queried with slash-separated node paths such as
 * "/html/body/div/table[2]/tr/td[2]/p/a/". Each segment names a child
 * element; an optional "[n]" suffix selects the n-th (1-based) child with
 * that name. Paths start with '/' and every segment, including the last one,
 * must be terminated by '/'.
 *
 * NOTE(review): the class releases tdoc in its destructor but declares no
 * copy constructor or copy assignment, so copying a Parser would release the
 * same handle twice -- avoid copying instances.
 */
class Parser {
    TidyDoc tdoc;   // tidy document handle; created by open(), released by ~Parser()
    bool valid;     // outcome of the most recent open()

    // Moves currentNode to its pos-th (1-based) child named `tag`.
    // Returns false, leaving currentNode untouched, when there is no such child.
    bool walk(TidyNode &currentNode, const char* tag, int pos);

    // Splits a path segment like "table[2]" into the bare tag name (written
    // back into token) and the index (written into pos, 1 when absent).
    void analizeToken(wxString &token, int &pos);

    // Resolves a node path starting from the document root; returns a null
    // node when any segment cannot be matched.
    TidyNode getNode(wxString path);

public:
    // Creates a parser with no document loaded; isValid() is false.
    Parser();

    // Convenience constructor: equivalent to calling open(input).
    Parser(const char* input);

    // Parses `input` with tidy; returns (and records) whether that succeeded.
    bool open(const char* input);

    // Value of attribute `attr` on the node addressed by `path`
    // (empty string when the node cannot be found).
    wxString getValue(wxString path, TidyAttrId attr);

    // Concatenated text of the direct text children of the node addressed by
    // `path` (empty string when the node cannot be found).
    wxString getText(wxString path);

    // True when the last open() succeeded.
    bool isValid() const;

    // Releases the tidy document.
    ~Parser();
};
/**
 * Creates a parser with no document loaded.
 *
 * tdoc is set to a null handle instead of being left indeterminate: the
 * destructor hands tdoc to tidyRelease unconditionally, so a default-built
 * Parser that never had open() called must not carry a garbage pointer.
 * (tidyRelease is expected to ignore a null document -- confirm against the
 * tidy version in use.)
 */
Parser::Parser() : tdoc(0), valid(false) {
}
/**
 * Builds a parser and immediately parses `input`.
 *
 * Both members are initialised by open() itself (it creates tdoc and sets
 * valid), so nothing else needs to happen here; callers inspect the outcome
 * through isValid().
 *
 * @param input  markup passed on to open(); must not be null.
 */
Parser::Parser(const char* input) {
    this->open(input);
}
bool Parser::open(const char* input) {
TidyBuffer output = {0};
TidyBuffer errbuf = {0};
int rc = -1;
Bool ok;
valid=false;
assert(input);
tdoc = tidyCreate(); // Initialize "document"
ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes ); // Convert to XHTML
if ( ok )
rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics
if ( rc >= 0 )
rc = tidyParseString( tdoc, input ); // Parse the input
if ( rc >= 0 )
rc = tidyCleanAndRepair( tdoc ); // Tidy it up!
if ( rc >= 0 )
rc = tidyRunDiagnostics( tdoc ); // Kvetch
if ( rc > 1 ) // If error, force output.
rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
if ( rc >= 0 )
rc = tidySaveBuffer( tdoc, &output ); // Pretty Print
if ( rc >= 0 )
valid=true;
else
valid=false;
tidyBufFree( &output );
tidyBufFree( &errbuf );
return valid;
}
/**
 * Reports the outcome of the most recent open().
 *
 * @return the stored `valid` flag (false for a default-constructed parser).
 */
bool Parser::isValid() const {
    return valid;
}
/**
 * Hands the tidy document stored in tdoc back to the library.
 */
Parser::~Parser() {
    tidyRelease(tdoc);
}
/**
 * Resolves a slash-separated node path starting at the document root.
 *
 * The first character of `path` is skipped (it is expected to be the leading
 * '/'); after that every segment up to the next '/' is split into tag name
 * and index by analizeToken() and followed with walk(). Text after the last
 * '/' is ignored, so each segment must be terminated by '/'.
 *
 * Empty segments (e.g. the "//" in "/html//body/") are skipped.
 *
 * @param path  node path such as "/html/body/div/table[2]/".
 * @return the addressed node, the root node when the path has no segments,
 *         or a null node when a segment has no matching child.
 */
TidyNode Parser::getNode(wxString path) {
    TidyNode currentNode = tidyGetRoot(tdoc);

    // Nothing to follow; also avoids starting a search past the end of the string.
    if (path.IsEmpty()) return currentNode;

    size_t initTag = 1;   // skip the leading '/'
    size_t endTag = path.find('/', initTag);
    while (endTag != wxString::npos) {
        wxString token = path.substr(initTag, endTag - initTag);

        // Advance to the next segment *before* any early exit: skipping an
        // empty segment without moving forward would loop forever.
        initTag = endTag + 1;
        endTag = path.find('/', initTag);

        if (token.IsEmpty()) continue;

        int pos = 1;
        analizeToken(token, pos);
        if (!walk(currentNode, token.mb_str(), pos)) return 0;
    }
    return currentNode;
}
/**
 * Returns the value of one attribute of the node addressed by `path`.
 *
 * @param path  node path understood by getNode().
 * @param attr  tidy attribute id to read (e.g. TidyAttr_HREF).
 * @return the attribute value decoded as UTF-8, or an empty string when the
 *         node is not found, the attribute is absent, or it has no value.
 */
wxString Parser::getValue(wxString path, TidyAttrId attr) {
    wxString result;

    TidyNode currentNode = getNode(path);
    if (!currentNode) return result;

    // Check each step explicitly instead of relying on tidy and wxString to
    // tolerate null pointers for a missing attribute.
    TidyAttr attribute = tidyAttrGetById(currentNode, attr);
    if (!attribute) return result;

    const char* value = tidyAttrValue(attribute);
    if (value) result = wxString(value, wxConvUTF8);

    return result;
}
/**
 * Returns the text directly contained in the node addressed by `path`.
 *
 * Only the immediate text children of the node are collected, in document
 * order; text nested inside child elements is not included.
 *
 * @param path  node path understood by getNode().
 * @return the concatenated text decoded as UTF-8, or an empty string when the
 *         node is not found or has no text children.
 */
wxString Parser::getText(wxString path) {
    wxString result;

    TidyNode currentNode = getNode(path);
    if (!currentNode) return result;

    for (TidyNode child = tidyGetChild(currentNode); child; child = tidyGetNext(child)) {
        if (!tidyNodeIsText(child)) continue;

        TidyBuffer buf = {0};
        // Only append when tidy actually produced data for this node.
        if (tidyNodeGetText(tdoc, child, &buf) && buf.bp) {
            result += wxString(reinterpret_cast<const char*>(buf.bp), wxConvUTF8);
        }
        // The buffer is filled by tidy on every iteration and must be
        // released here, otherwise each text child leaks its contents.
        tidyBufFree(&buf);
    }
    return result;
}
/**
 * Splits a path segment into tag name and 1-based index.
 *
 * "table[2]" becomes token "table" with pos 2; a segment without a complete
 * "[...]" suffix is left unchanged and pos is set to 1.
 *
 * The index is converted with atoi, so non-numeric or non-positive text
 * yields pos <= 0; that value is passed through as parsed and callers must
 * treat it as invalid.
 *
 * @param token  in: raw segment; out: segment with the "[n]" suffix removed.
 * @param pos    out: requested occurrence, 1 when no index is given.
 */
void Parser::analizeToken(wxString &token, int &pos) {
    pos = 1;

    const size_t openBracket = token.find('[');
    if (openBracket == wxString::npos) return;

    // Look for the closing bracket only after the opening one, so a stray
    // ']' earlier in the segment cannot produce a negative length.
    const size_t closeBracket = token.find(']', openBracket + 1);
    if (closeBracket == wxString::npos) return;

    const wxString indexText =
        token.substr(openBracket + 1, closeBracket - openBracket - 1);
    pos = atoi(indexText.mb_str());
    token = token.substr(0, openBracket);
}
bool Parser::walk(TidyNode ¤tNode, const char* tag, int pos) {
assert(tag);
assert(pos>=1);
int index=0;
TidyNode child;
for ( child = tidyGetChild(currentNode); child; child = tidyGetNext(child) ) {
char* name=(char*)tidyNodeGetName(child);
if (!name) continue;
if (!strcmp(name, tag)) index++;
if (index==pos) {
currentNode=child;
return true;
}
}
return false;
}