Welcome toVigges Developer Community-Open, Learning,Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
580 views
in Technique[技术] by (71.8m points)

mapreduce - Merging two collections in MongoDB

I've been trying to use MapReduce in MongoDB to do what I think is a simple procedure. I don't know if this is the right approach, of if I should even be using MapReduce. I googled what keywords I thought of and tried to hit the docs where I thought I would have the most success - but nothing. Maybe I'm thinking too hard about this?

I have two collections: details and gpas

details is made up of a whole bunch of documents (3+ million). The studentid element can be repeated two times, one for each year, like the following:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640100"), "classes" : [1,17,19,21], "studentid" : "12345a", "year" : 1}
{ "_id" : ObjectId("4d76b7oij7s2d8372v640100"), "classes" : [2,12,19,22], "studentid" : "98765a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8372v640100"), "classes" : [32,91,101,217], "studentid" : "12345a", "year" : 2}
{ "_id" : ObjectId("4d76b7rty7s2d8372v640100"), "classes" : [1,11,18,22], "studentid" : "24680a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8856v640100"), "classes" : [32,99,110,215], "studentid" : "98765a", "year" : 2}
...

gpas has elements with the same studentid's from details. Only one entry per studentid, like this:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "overall" : 76, "subscore": 2}
...

In the end I want to have a collection with one row for each student in this format:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "classes_1": [1,17,19,21], "classes_2": [32,91,101,217], "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "classes_1": [2,12,19,22], "classes_2": [32,99,110,215], "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "classes_1": [1,11,18,22], "classes_2": [], "overall" : 76, "subscore": 2}
...

The way I was going to do this was by running MapReduce like this:

var mapDetails = function() {
    emit(this.studentid, {studentid: this.studentid, classes: this.classes, year: this.year, overall: 0, subscore: 0});
};

var mapGpas = function() {
    emit(this.studentid, {studentid: this.studentid, classes: [], year: 0, overall: this.overall, subscore: this.subscore});
};

var reduce = function(key, values) {
    var outs = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    values.forEach(function(value) {
        if (value.year == 0) {
            outs.overall = value.overall;
            outs.subscore = value.subscore;
        }
        else {
            if (value.year == 1) {
                outs.classes_1 = value.classes;
            }
            if (value.year == 2) {
                outs.classes_2 = value.classes;
            }

            outs.studentid = value.studentid;
        }
    });

    return outs;

};

res = db.details.mapReduce(mapDetails, reduce, {out: {reduce: 'joined'}})
res = db.gpas.mapReduce(mapGpas, reduce, {out: {reduce: 'joined'}})

But when I run it, this is my resulting collection:

{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 85, "subscore" : 5 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }

I'm missing the classes arrays.

Also, as an aside, how do I access the elements in resulting MapReduce value element? Does MapReduce always output to value or whatever else you name it?

Question&Answers:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

This is similar to a question that was asked on the MongoDB-users Google Groups.
https://groups.google.com/group/mongodb-user/browse_thread/thread/60a8b683e2626ada?pli=1

The answer references an on-line tutorial which looks similar to your example: http://tebros.com/2011/07/using-mongodb-mapreduce-to-join-2-collections/

For more information on MapReduce in MongoDB, please see the documentation: http://www.mongodb.org/display/DOCS/MapReduce

Additionally, there is a useful step-by-step walkthrough of how a MapReduce operation works in the "Extras" Section of the MongoDB Cookbook article titled, "Finding Max And Min Values with Versioned Documents": http://cookbook.mongodb.org/patterns/finding_max_and_min/

Forgive me if you have already read some of the referenced documents. I have included them for the benefit of other users who may be reading this post and new to using MapReduce in MongoDB

It is important that the outputs from the 'emit' statements in the Map functions match the outputs of the Reduce function. If there is only one document output by the Map function, the Reduce function might not be run at all, and then your output collection will have mismatched documents.

I have slightly modified your map statements to emit documents in the format of your desired output, with two separate "classes" arrays.
I have also reworked your reduce statement to add new classes to the classes_1 and classes_2 arrays, only if they do not already exist.

var mapDetails = function(){
    var output = {studentid: this.studentid, classes_1: [], classes_2: [], year: this.year, overall: 0, subscore: 0}
    if (this.year == 1) {
        output.classes_1 = this.classes;
    }
    if (this.year == 2) {
        output.classes_2 = this.classes;
    }
    emit(this.studentid, output);
};

var mapGpas = function() {
    emit(this.studentid, {studentid: this.studentid, classes_1: [], classes_2: [], year: 0, overall: this.overall, subscore: this.subscore});
};

var r = function(key, values) {
    var outs = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    values.forEach(function(v){
        outs.studentid = v.studentid;
        v.classes_1.forEach(function(class){if(outs.classes_1.indexOf(class)==-1){outs.classes_1.push(class)}})
        v.classes_2.forEach(function(class){if(outs.classes_2.indexOf(class)==-1){outs.classes_2.push(class)}})

        if (v.year == 0) {
            outs.overall = v.overall;
            outs.subscore = v.subscore;
        }
    });
    return outs;
};

res = db.details.mapReduce(mapDetails, r, {out: {reduce: 'joined'}})
res = db.gpas.mapReduce(mapGpas, r, {out: {reduce: 'joined'}})

Running the two MapReduce operations results in the following collection, which matches your desired format:

> db.joined.find()
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ 1, 17, 19, 21 ], "classes_2" : [ 32, 91, 101, 217 ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ 1, 11, 18, 22 ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ 2, 12, 19, 22 ], "classes_2" : [ 32, 99, 110, 215 ], "overall" : 85, "subscore" : 5 } }
>

MapReduce always outputs documents in the form of {_id:"id", value:"value"} There is more information available on working with sub-documents in the document titled, "Dot Notation (Reaching into Objects)": http://www.mongodb.org/display/DOCS/Dot+Notation+%28Reaching+into+Objects%29

If you would like the output of MapReduce to appear in a different format, you will have to do that programmatically in your application.

Hopefully this will improve your understanding of MapReduce, and get you one step closer to producing your desired output collection. Good Luck!


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to Vigges Developer Community for programmer and developer-Open, Learning and Share
...