#include "LinkGenerator.h"
//#include "../link41b/include/link-includes.h"

#include <cctype>
#include <cstring>

using namespace std;

// NOTE(review): the container element types written below (vector<WordItem>,
// vector<int>) follow how the elements are used in this file; confirm they
// match the declarations in LinkGenerator.h.

namespace {

/* One row of the link-label table: a link label (leading upper-case part
   only) and the human-readable description written to the CSV output. */
struct LinkLabelInfo {
    const char *label;
    const char *description;
};

/* Every link label that is turned into a dependency edge.  A label that is
   not listed here is ignored by LinksToDependencies and is described as
   "UNKNOWN" by linkLabelDescription. */
const LinkLabelInfo kLinkLabels[] = {
    { "A",   "Adj->Noun" },
    { "AM",  "'as much' etc.." },
    { "AN",  "noun-modifiers->Nouns" },
    { "AZ",  "As->Verb" },
    { "B",   "rel. clauses" },
    { "BI",  "'be' to some idiomatic expression" },
    { "BT",  "fronted objects in time expressions" },
    { "BW",  "What, whatever -> Verbs" },
    { "C",   "Coordinating Conjunction -> Subject" },
    { "CC",  "Clauses to Coordinating Conjunctions" },
    { "D",   "Determiners -> Nouns" },
    { "DD",  "Definitive Determiners -> #s or Adj. as Nouns" },
    { "DG",  "Determiner -> Proper Noun" },
    { "DP",  "Posessive Determiner -> Gerund" },
    { "DT",  "Determiners -> Nouns in time expressions" },
    { "E",   "verb-modifying adverbs" },
    { "EA",  "Adverbs -> Adjectives" },
    { "EB",  "Adverbs -> forms of 'be'" },
    { "EC",  "Adverbs -> comparative adjectives" },
    { "EE",  "Adverbs -> Adverbs" },
    { "EF",  "'Enough' -> Adj/Adverbs" },
    { "EI",  "Adverbs (eg soon, shortly) -> after, before" },
    { "EL",  "????? ____ -> Else" },
    { "EN",  "Adverbs -> Quantities" },
    { "EZ",  "Adverbs -> 'as'" },
    { "FL",  "for -> long" },
    { "FM",  "from -> preposition" },
    { "GN",  "common noun -> proper noun" },
    { "I",   "????? verbs -> Infinitives" },
    { "IN",  "in -> idiomatic time expressions" },
    { "J",   "prepositions -> Objects" },
    { "JG",  "prepositions -> Proper nouns: 'of science'" },
    { "JT",  "conjunctions -> time expressions" },
    { "M",   "nouns -> modifiers" },
    { "MG",  "certain prepositions -> nouns" },
    { "MV",  "verbs -> modifying phrases" },
    { "MX",  "noun -> modifiers" },
    { "ND",  "numerical determiner -> expressions" },
    { "NF",  "fractional words -> of" },
    { "NI",  "special idiomatic number phrase -> number" },
    { "NJ",  "used with NF" },
    { "NN",  "connects number words in a series" },
    { "NR",  "fraction words -> superlatives" },
    { "NS",  "singular expressions: Numerical determiner -> expression" },
    { "NT",  "not -> to" },
    { "NW",  "numbers -> fractional numbers" },
    { "O",   "transitive verbs -> objects" },
    { "OD",  "rise/fall -> distances" },
    { "OF",  "certain verb/adjectives -> of" },
    { "ON",  "on -> time expressions" },
    { "OT",  "verbs -> time expressions (as objects)" },
    { "P",   "be verbs -> complements" },
    { "PP",  "have -> past participles" },
    { "R",   "nouns -> relative clauses" },
    { "RS",  "start of a relative clause -> subject of" },
    { "S",   "subject nouns -> finite verbs" },
    { "SI",  "verbs -> subjects in subject-verb inversion" },
    { "SX",  "I -> was/am" },
    { "SXI", "I -> was/am in subject verb inversion" },
    { "TA",  "adjectives -> month names" },
    { "TD",  "day-of-the-week -> morning|afternoon|evening" },
    { "TH",  "verbs -> that[clause]" },
    { "TI",  "titles" },
    { "TS",  "certain verbs -> conjunctive clauses" },
    { "TT",  "'time' -> previous adjective" },
    { "TW",  "days of the week -> month names" },
    { "U",   "subject-object/determiner -> noun" },
    { "UN",  "until|since -> time phrases" },
    { "WN",  "when phrases -> time nouns" },
    { "Y",   "idiomatic time + place expressions <-" },
    { "YS",  "'s -> noun" },
    { "G",   "proper noun in a series" },
    // NOTE(review): this text duplicates the "G" description; the label list
    // this table replaced annotated ID as "idiomatic strings".  The emitted
    // text is left unchanged here - confirm which one is intended.
    { "ID",  "proper noun in a series" },
    { "TM",  "month names -> day numbers" }
};

const size_t kLinkLabelCount = sizeof(kLinkLabels) / sizeof(kLinkLabels[0]);

/* Returns the description for `label`, or NULL when the label is not one of
   the labels that become dependency edges. */
const char *findLinkLabelDescription(const string &label)
{
    for (size_t i = 0; i < kLinkLabelCount; ++i) {
        if (label == kLinkLabels[i].label)
            return kLinkLabels[i].description;
    }
    return NULL;
}

/* True when the whitespace-delimited token that ends right before
   text[punctuation] is one of the titles mr / ms / mrs / dr (any letter
   case).  The whole token is compared so that a word such as "farms" is not
   mistaken for the title "ms". */
bool endsWithTitleAbbreviation(const string &text, size_t punctuation)
{
    size_t start = punctuation;
    while (start > 0 && !isspace(static_cast<unsigned char>(text[start - 1])))
        --start;

    string token = text.substr(start, punctuation - start);
    for (size_t i = 0; i < token.size(); ++i)
        token[i] = static_cast<char>(tolower(static_cast<unsigned char>(token[i])));

    return (token == "mr") || (token == "ms") || (token == "mrs") || (token == "dr");
}

/* If `word` ends in ".<c>" (a dot followed by exactly one more character),
   stores <c> in `tag` and returns true; otherwise leaves `tag` untouched and
   returns false. */
bool readInflectionTag(const char *word, char &tag)
{
    if (word == NULL)
        return false;
    const size_t length = strlen(word);
    if ((length < 2) || (word[length - 2] != '.'))
        return false;
    tag = word[length - 1];
    return true;
}

} // namespace

LinkGenerator::LinkGenerator(void)
{
}

LinkGenerator::~LinkGenerator(void)
{
}

/*
    Placeholder overload: the article is not examined and an empty
    DependencyStructure is returned.
*/
DependencyStructure LinkGenerator::Generate(vector<WordItem> article)
{
    DependencyStructure q;
    return q;
}

/*
    Reads the whitespace-separated words of the file `filename`, splits the
    text into sentences and, for each sentence,
        - integrates the words with the DependencyStructure
        - adds dependencies based on the link generator
    Every accepted dependency is also written as one row of
    "dependencies.csv" in the current directory.

    Returns the DependencyStructure that was built.  It is empty when the
    article file or the link grammar dictionary cannot be opened.
*/
DependencyStructure LinkGenerator::Generate(string filename)
{
    dependencies.open("dependencies.csv");
    if (!dependencies.is_open()) {
        cout << "Unable to open dependencies file" << endl;
        // Wait for some input so the message can be read before continuing.
        // (This used to read into an uninitialised char*.)
        string pause;
        cin >> pause;
    }
    dependencies << "\"Left Index\",\"Left Word\",\"Label\",\"Label Description\",\"Right Index\",\"Right Word\"" << endl;

    s = DependencyStructure();

    // Read-only stream: a plain fstream opens for reading AND writing, which
    // fails on a read-only file.
    ifstream articleFile(filename.c_str());
    if (!articleFile.is_open()) {
        cout << "Unable to open article file: " << filename << endl;
        dependencies.close();
        return s;
    }

    // Join the words with single spaces.  Testing the extraction itself
    // (instead of eof()) keeps the last word from being appended twice and
    // ends the loop on any stream failure.
    string article;
    string word;
    while (articleFile >> word) {
        article.append(word);
        article.append(" ");
    }

    Dictionary dict = dictionary_create("../link41b/data/4.0.dict", "../link41b/data/4.0.knowledge", NULL, "../link41b/data/4.0.affix");
    if (dict == NULL) {
        cout << "Unable to load the link grammar dictionary" << endl;
        dependencies.close();
        return s;
    }
    Parse_Options opts = parse_options_create();
    parse_options_set_max_parse_time(opts, 30);

    string sentence;
    while (nextSentence(article, sentence)) {
        LinksToDependencies(sentence, dict, opts);
        sentence = string();
    }

    // Release the parser resources created above.
    parse_options_delete(opts);
    dictionary_delete(dict);

    dependencies.close();
    return s;
}

/*
    Removes the first sentence from the front of `article` and stores it in
    `sentence`.

    A sentence ends at a '.', '?' or '!' that is followed by a space, newline
    or tab, unless
        - the terminator sits within the first four characters,
        - the character before it is a '.' (e.g. "..." or "e.g."), or
        - it is the '.' of one of the titles mr / ms / mrs / dr.

    Returns false, leaving both strings untouched, when `article` is empty or
    holds no such sentence end; trailing text without a terminator is
    therefore never returned.
*/
bool LinkGenerator::nextSentence(string &article, string &sentence)
{
    if (article.empty())
        return false;

    const size_t length = article.length();
    size_t end = 0;
    for (size_t i = 5; i < length; ++i) {
        const char punctuation = article[i - 1];
        const char following = article[i];

        const bool isTerminator = (punctuation == '.') || (punctuation == '?') || (punctuation == '!');
        const bool isSpace = (following == ' ') || (following == '\n') || (following == '\t');
        if (!isTerminator || !isSpace)
            continue;

        if (article[i - 2] == '.')
            continue;
        if ((punctuation == '.') && endsWithTitleAbbreviation(article, i - 1))
            continue;

        end = i;
        break;
    }

    if (end == 0)
        return false;

    if (end == length - 1) {
        // Only the separator follows the terminator: hand over everything.
        sentence = article;
        article = "";
        return true;
    }

    sentence = article.substr(0, end);
    article = article.substr(end);
    return true;
}

/*
    GetNextSentence will use an article of words, and a position to begin
    within the article to generate a string of the next sentence and to
    generate a mapping of locations for words that represent the same subject.

    Walks the member word list `a` from `position` up to and including the
    next "." word:
        - `starting` receives the position the walk began at,
        - each word is appended to `sentence`, preceded by a space,
        - one graph index per word is appended to `location`; a word whose
          tag starts with 'N' reuses an existing vertex with the same word
          when s.Find locates one, every other word gets a new vertex,
        - `position` ends one past the last word consumed.

    Returns true when `sentence` is not empty afterwards.
*/
bool LinkGenerator::GetNextSentence(int &position, int &starting, string &sentence, vector<int> &location)
{
    starting = position;
    const vector<WordItem> &article = a;   // TODO: renaming moo

    while ((position >= 0) && (static_cast<size_t>(position) < article.size())) {
        const WordItem &item = article[position];

        // the first priority is to construct the sentence.
        sentence.append(" ");
        sentence.append(item.word);

        if (!item.tag.empty() && (item.tag[0] == 'N')) {
            // if the WordItem is a noun, try to find it first and only add
            // it when it is not found.  The index is resolved before it is
            // pushed, so `location` is never indexed by `position` (which
            // need not be a valid index into `location`).
            int index = s.Find(item.word);
            if (index == -1)
                index = s.AddWord(item);
            location.push_back(index);
        } else {
            // if it's not a noun, just add it
            location.push_back(s.AddWord(item));
            if (item.word == ".")
                break;
        }
        position++;
    }
    position++;

    return sentence != "";
}

/*
    Returns the leading part of a link label: the first character plus every
    upper-case letter that directly follows it ("MVp" -> "MV", "Ds" -> "D").
    A label with no other kind of character after the first is returned
    whole.
*/
string LinkGenerator::linkLabelBeginning(string label)
{
    for (size_t i = 1; i < label.size(); i++) {
        if ((label[i] < 'A') || (label[i] > 'Z'))
            return label.substr(0, i);
    }
    return label;
}

/*
    Parses `sentence` with the link grammar parser and merges the result into
    the member DependencyStructure `s`:
        - every word of the parsed sentence except the first and last entries
          (the loop below skips indices 0 and sentence_length-1) is added as
          a vertex,
        - every link of the first linkage whose label is in the link-label
          table becomes an edge and a row in the dependencies CSV,
        - a word written as "<text>.<c>" by the linkage gets <c> as its tag,
        - edges that point at a vertex tagged "n" for which s.Find returns a
          different index are redirected to that index.
*/
void LinkGenerator::LinksToDependencies(string sentence, Dictionary dict, Parse_Options opts)
{
    parse_options_reset_resources(opts);

    // sentence_create() takes a non-const char*, so give it a private,
    // NUL-terminated copy that lives until the end of this function.
    vector<char> sentenceChars(sentence.begin(), sentence.end());
    sentenceChars.push_back('\0');

    Sentence sent = sentence_create(&sentenceChars[0], dict);
    if (sent == NULL)
        return;

    const int num_linkages = sentence_parse(sent, opts);
    const int num_words = sentence_length(sent);

    // Sentence word i is stored at vertex starting + i (word 1 becomes the
    // first vertex appended below).
    const int starting = static_cast<int>(s.Verticies.size()) - 1;
    for (int i = 1; i < num_words - 1; i++) {
        s.AddWord(WordItem(sentence_get_word(sent, i), ""));
    }
    const int vertexCount = static_cast<int>(s.Verticies.size());

    if (VERBOSITY > 1)
        cout << num_linkages << " linkages in: " << sentence << endl;

    // Only the first linkage is used.
    if (num_linkages > 0) {
        Linkage linkage = linkage_create(0, sent, opts);
        linkage_compute_union(linkage);
        linkage_set_current_sublinkage(linkage, linkage_get_num_sublinkages(linkage) - 1);

        const int links = linkage_get_num_links(linkage);
        for (int j = 0; j < links; j++) {
            const string label = linkLabelBeginning(linkage_get_link_label(linkage, j));
            const char *description = findLinkLabelDescription(label);
            if (description == NULL)
                continue;

            const int l = linkage_get_link_lword(linkage, j) + starting;
            const int r = linkage_get_link_rword(linkage, j) + starting;

            // Both ends must be vertices added for this sentence; this
            // leaves out the first and last sentence entries, which have no
            // vertex of their own.
            if ((l <= starting) || (l >= vertexCount) || (r <= starting) || (r >= vertexCount))
                continue;

            // Each end is tagged from its own inflected word.
            char tag = '\0';
            if (readInflectionTag(linkage_get_word(linkage, l - starting), tag))
                s.Verticies[l].tag = tag;
            if (readInflectionTag(linkage_get_word(linkage, r - starting), tag))
                s.Verticies[r].tag = tag;

            s.Edges.push_back(Dependency(l, r, label));

            if (VERBOSITY > 3)
                printf("#%-4d%-10s -- %-40s -> #%-4d%-10s\n", l, sentence_get_word(sent, l - starting), description, r, sentence_get_word(sent, r - starting));

            dependencies << l << ",\"" << sentence_get_word(sent, l - starting) << "\","
                         << label << ",\"" << description << "\","
                         << r << ",\"" << sentence_get_word(sent, r - starting) << "\"" << endl;

            // TODO (kept from an earlier note): words joined by G, ID or TM
            // links should be made the same vertex.
        }
        linkage_delete(linkage);
    }
    sentence_delete(sent);

    // now review all the words in the sentence, looking for duplicate objects.
    for (int i = starting + 1; i < static_cast<int>(s.Verticies.size()); i++) {
        const int n = s.Find(s.Verticies[i]);
        if ((s.Verticies[i].tag.compare("n") == 0) && (n != i)) {
            // if it finds one, replace its references in the edges.
            for (size_t j = 0; j < s.Edges.size(); j++) {
                if (s.Edges[j].child == i)
                    s.Edges[j].child = n;
                if (s.Edges[j].parent == i)
                    s.Edges[j].parent = n;
            }
        }
    }
}

/*
    Returns the human-readable description of a link label (as produced by
    linkLabelBeginning), or "UNKNOWN" for a label that is not in the table.
*/
string LinkGenerator::linkLabelDescription(string label)
{
    const char *description = findLinkLabelDescription(label);
    return (description != NULL) ? description : "UNKNOWN";
}